Skip to content
Snippets Groups Projects
Commit d7deccc9 authored by Matteo Cicuttin
Browse files

Lifting perf tests

parent cbf819d4
No related branches found
No related tags found
No related merge requests found
...@@ -19,7 +19,7 @@ struct kernel_gpu_sizes<1> ...@@ -19,7 +19,7 @@ struct kernel_gpu_sizes<1>
static const size_t cells_per_dblock = 32; static const size_t cells_per_dblock = 32;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128; static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4; static const size_t parallel_dblocks = 1;
}; };
template<> template<>
...@@ -34,7 +34,7 @@ struct kernel_gpu_sizes<2> ...@@ -34,7 +34,7 @@ struct kernel_gpu_sizes<2>
static const size_t cells_per_dblock = 12; static const size_t cells_per_dblock = 12;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128; static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4; static const size_t parallel_dblocks = 1;
}; };
template<> template<>
...@@ -49,7 +49,7 @@ struct kernel_gpu_sizes<3> ...@@ -49,7 +49,7 @@ struct kernel_gpu_sizes<3>
static const size_t cells_per_dblock = 6; static const size_t cells_per_dblock = 6;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128; static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4; static const size_t parallel_dblocks = 1;
}; };
template<> template<>
...@@ -147,4 +147,4 @@ gpu_compute_field_derivatives(entity_data_gpu& edg, ...@@ -147,4 +147,4 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
gpuTextureObject_t F, double *dF_dx, double* dF_dy, double* dF_dz); gpuTextureObject_t F, double *dF_dx, double* dF_dy, double* dF_dz);
void void
gpu_compute_flux_lifting(entity_data_gpu& edg, gpuTextureObject_t f, double *out); gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out);
\ No newline at end of file \ No newline at end of file
...@@ -58,7 +58,7 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed) ...@@ -58,7 +58,7 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
auto src_row = iO*ed.num_bf+i; auto src_row = iO*ed.num_bf+i;
auto src_col = j; auto src_col = j;
auto dst_row = i; auto dst_row = i;
auto dst_col = iO*4*ed.num_fluxes + ( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) ); auto dst_col = iO*4*ed.num_fluxes + j;//( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) );
lm(dst_row, dst_col) = ed.lifting_matrices(src_row, src_col); lm(dst_row, dst_col) = ed.lifting_matrices(src_row, src_col);
} }
} }
......
...@@ -189,7 +189,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg, ...@@ -189,7 +189,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
template<size_t K> template<size_t K>
__global__ void __global__ void
gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
const double * __restrict__ dets, double * __restrict__ lifted_flux, const double * __restrict__ dets, double * __restrict__ lifted_flux,
int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base) int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base)
{ {
...@@ -215,8 +215,8 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, ...@@ -215,8 +215,8 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++) for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
{ {
int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof; int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
int32_t f_ofs = elem_flux_base + (dof+delta)%(4*KS::num_fluxes); int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes);
acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs); acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs);
} }
lifted_flux[cur_dof_offset] += acc; lifted_flux[cur_dof_offset] += acc;
...@@ -224,9 +224,9 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, ...@@ -224,9 +224,9 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
template<size_t K> template<size_t K>
void void
launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out) launch_lift_kernel(entity_data_gpu& edg, const double *f, double *out)
{ {
const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads; const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK; auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK) if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1; num_blocks += 1;
...@@ -245,7 +245,7 @@ launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out) ...@@ -245,7 +245,7 @@ launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
} }
void void
gpu_compute_flux_lifting(entity_data_gpu& edg, gpuTextureObject_t f, double *out) gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out)
{ {
switch (edg.a_order) switch (edg.a_order)
{ {
......
...@@ -137,14 +137,14 @@ int test_lifting(int geometric_order, int approximation_order) ...@@ -137,14 +137,14 @@ int test_lifting(int geometric_order, int approximation_order)
edgs.push_back( std::move(edg) ); edgs.push_back( std::move(edg) );
} }
texture_allocator<double> PFdotn_gpu(PFdotn.data(), PFdotn.size()); device_vector<double> PFdotn_gpu(PFdotn.data(), PFdotn.size());
device_vector<double> LiftF_gpu(LiftF.data(), LiftF.size()); device_vector<double> LiftF_gpu(LiftF.data(), LiftF.size());
for (auto& edg : edgs) for (auto& edg : edgs)
{ {
timecounter_gpu tc; timecounter_gpu tc;
tc.tic(); tc.tic();
gpu_compute_flux_lifting(edg, PFdotn_gpu.get_texture(), LiftF_gpu.data()); gpu_compute_flux_lifting(edg, PFdotn_gpu.data(), LiftF_gpu.data());
double time = tc.toc(); double time = tc.toc();
auto num_cells = edg.num_all_elems; auto num_cells = edg.num_all_elems;
...@@ -153,6 +153,12 @@ int test_lifting(int geometric_order, int approximation_order) ...@@ -153,6 +153,12 @@ int test_lifting(int geometric_order, int approximation_order)
std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
double flops = 3*(edg.num_bf)*4*edg.num_fluxes*num_cells; double flops = 3*(edg.num_bf)*4*edg.num_fluxes*num_cells;
std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
auto read_gbs = 8*4*edg.num_fluxes*num_cells/(1e9*time);
auto write_gbs = 8*edg.num_bf*num_cells/(1e9*time);
auto tot_gbs = read_gbs + write_gbs;
std::cout << "Read: " << read_gbs << " GB/s, write: " << write_gbs;
std::cout << " GB/s, total: " << tot_gbs << " GB/s" << std::endl;
} }
else else
{ {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment