diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h index 3efbbb9ff720506537f0ca7d43e5230a9979fe29..d8df2bfb2d16977367fe4b57b0f1b8da83ba0735 100644 --- a/include/kernels_gpu.h +++ b/include/kernels_gpu.h @@ -19,7 +19,7 @@ struct kernel_gpu_sizes<1> static const size_t cells_per_dblock = 32; static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dblock_size = 128; - static const size_t parallel_dblocks = 4; + static const size_t parallel_dblocks = 1; }; template<> @@ -34,7 +34,7 @@ struct kernel_gpu_sizes<2> static const size_t cells_per_dblock = 12; static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dblock_size = 128; - static const size_t parallel_dblocks = 4; + static const size_t parallel_dblocks = 1; }; template<> @@ -49,7 +49,7 @@ struct kernel_gpu_sizes<3> static const size_t cells_per_dblock = 6; static const size_t dofs_per_dblock = num_bf * cells_per_dblock; static const size_t dblock_size = 128; - static const size_t parallel_dblocks = 4; + static const size_t parallel_dblocks = 1; }; template<> @@ -147,4 +147,4 @@ gpu_compute_field_derivatives(entity_data_gpu& edg, gpuTextureObject_t F, double *dF_dx, double* dF_dy, double* dF_dz); void -gpu_compute_flux_lifting(entity_data_gpu& edg, gpuTextureObject_t f, double *out); \ No newline at end of file +gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out); \ No newline at end of file diff --git a/src/entity_data.cpp b/src/entity_data.cpp index dbf7ae52c72b2fe83d0933f5659bb72bde75a209..b4645797e61f4bce2588ec3e3cfbc64333ce47e8 100644 --- a/src/entity_data.cpp +++ b/src/entity_data.cpp @@ -58,7 +58,7 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed) auto src_row = iO*ed.num_bf+i; auto src_col = j; auto dst_row = i; - auto dst_col = iO*4*ed.num_fluxes + ( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) ); + auto dst_col = iO*4*ed.num_fluxes + j;//( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) ); lm(dst_row, dst_col) = ed.lifting_matrices(src_row, src_col); } } diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu index 1c06b3ffd6c35baa0e03ed97a123363b9e4ef172..667d480c5b764d2ae0c24da934457e9a2ffe34cc 100644 --- a/src/kernels_cuda.cu +++ b/src/kernels_cuda.cu @@ -189,7 +189,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg, template<size_t K> __global__ void -gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, +gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex, const double * __restrict__ dets, double * __restrict__ lifted_flux, int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base) { @@ -215,8 +215,8 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++) { int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof; - int32_t f_ofs = elem_flux_base + (dof+delta)%(4*KS::num_fluxes); - acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs); + int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes); + acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs); } lifted_flux[cur_dof_offset] += acc; @@ -224,9 +224,9 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, template<size_t K> void -launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out) +launch_lift_kernel(entity_data_gpu& edg, const double *f, double *out) { - const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads; + const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads; auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK; if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK) num_blocks += 1; @@ -245,7 +245,7 @@ launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out) } void -gpu_compute_flux_lifting(entity_data_gpu& edg, gpuTextureObject_t f, double *out) +gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out) { switch (edg.a_order) { diff --git a/tests/test_lifting_gpu.cpp b/tests/test_lifting_gpu.cpp index ee38029c153f0dc53fdf89915afe9086e6d879c9..6721e3ed54eca20cbcca620930861b726ba1d9b7 100644 --- a/tests/test_lifting_gpu.cpp +++ b/tests/test_lifting_gpu.cpp @@ -137,14 +137,14 @@ int test_lifting(int geometric_order, int approximation_order) edgs.push_back( std::move(edg) ); } - texture_allocator<double> PFdotn_gpu(PFdotn.data(), PFdotn.size()); + device_vector<double> PFdotn_gpu(PFdotn.data(), PFdotn.size()); device_vector<double> LiftF_gpu(LiftF.data(), LiftF.size()); for (auto& edg : edgs) { timecounter_gpu tc; tc.tic(); - gpu_compute_flux_lifting(edg, PFdotn_gpu.get_texture(), LiftF_gpu.data()); + gpu_compute_flux_lifting(edg, PFdotn_gpu.data(), LiftF_gpu.data()); double time = tc.toc(); auto num_cells = edg.num_all_elems; @@ -153,6 +153,12 @@ int test_lifting(int geometric_order, int approximation_order) std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; double flops = 3*(edg.num_bf)*4*edg.num_fluxes*num_cells; std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; + auto read_gbs = 8*4*edg.num_fluxes*num_cells/(1e9*time); + auto write_gbs = 8*edg.num_bf*num_cells/(1e9*time); + auto tot_gbs = read_gbs + write_gbs; + std::cout << "Read: " << read_gbs << " GB/s, write: " << write_gbs; + std::cout << " GB/s, total: " << tot_gbs << " GB/s" << std::endl; + } else {