diff --git a/CMakeLists.txt b/CMakeLists.txt
index a256dd0dad26554e6230fedddefe068706d27853..e1ece6ee8f84ea181a7dd770c7abd5d61eb98a3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,11 @@ if (OPT_ENABLE_GPU_SOLVER)
         add_definitions(-D__HIP_PLATFORM_HCC__)
     endif()
   endif()
+
+  option(OPT_BLOCKED_GPU_KERNELS "Use blocked kernels on GPU" OFF)
+  if (OPT_BLOCKED_GPU_KERNELS)
+    add_definitions(-DUSE_BLOCKED_GPU_KERNELS)
+  endif()
 endif()
 
 if (OPT_ENABLE_GPU_SOLVER)
diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index f38f35b38eeb2ce8de3347c8156c0881ffa99ee1..3efbbb9ff720506537f0ca7d43e5230a9979fe29 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -10,41 +10,61 @@ struct kernel_gpu_sizes;
 template<>
 struct kernel_gpu_sizes<1>
 {
-    static const size_t num_bf = num_dofs_3D(1);
-    static const size_t num_fluxes = num_dofs_2D(1);
+    static const size_t num_bf = num_dofs_3D(1); //4
+    static const size_t num_fluxes = num_dofs_2D(1); //3
 
     static const size_t deriv_threads = 128;
     static const size_t lifting_threads = 128;
+
+    static const size_t cells_per_dblock = 32;
+    static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 4;
 };
 
 template<>
 struct kernel_gpu_sizes<2>
 {
-    static const size_t num_bf = num_dofs_3D(2);
-    static const size_t num_fluxes = num_dofs_2D(2);
+    static const size_t num_bf = num_dofs_3D(2); //10
+    static const size_t num_fluxes = num_dofs_2D(2); //6
 
     static const size_t deriv_threads = 128;
     static const size_t lifting_threads = 128;
+
+    static const size_t cells_per_dblock = 12;
+    static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 4;
 };
 
 template<>
 struct kernel_gpu_sizes<3>
 {
-    static const size_t num_bf = num_dofs_3D(3);
-    static const size_t num_fluxes = num_dofs_2D(3);
+    static const size_t num_bf = num_dofs_3D(3); //20
+    static const size_t num_fluxes = num_dofs_2D(3); //10
 
     static const size_t deriv_threads = 128;
     static const size_t lifting_threads = 128;
+
+    static const size_t cells_per_dblock = 6;
+    static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 4;
 };
 
 template<>
 struct kernel_gpu_sizes<4>
 {
-    static const size_t num_bf = num_dofs_3D(4);
+    static const size_t num_bf = num_dofs_3D(4); //35
     static const size_t num_fluxes = num_dofs_2D(4);
 
     static const size_t deriv_threads = 512;
     static const size_t lifting_threads = 128;
+
+    static const size_t cells_per_dblock = 3;
+    static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 4;
 };
 
 template<>
@@ -55,6 +75,11 @@ struct kernel_gpu_sizes<5>
 {
     static const size_t deriv_threads = 1024;
     static const size_t lifting_threads = 128;
+
+    static const size_t cells_per_dblock = 2;
+    static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 4;
 };
 
 template<>
@@ -65,19 +90,30 @@ struct kernel_gpu_sizes<6>
 {
     static const size_t deriv_threads = 1024;
     static const size_t lifting_threads = 128;
+
+    static const size_t cells_per_dblock = 1;
+    static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 4;
 };
 
 struct kernel_gpu_sizes_runtime
 {
     size_t num_bf;
+    size_t cells_per_dblock;
+    size_t dblock_size;
 };
 
 template<size_t N>
 kernel_gpu_sizes_runtime
 get_kernel_gpu_sizes()
 {
     kernel_gpu_sizes_runtime ret;
+
+    using KS = kernel_gpu_sizes<N>;
-    ret.num_bf = kernel_gpu_sizes<N>::num_bf;
+    ret.num_bf = KS::num_bf;
+    ret.cells_per_dblock = KS::cells_per_dblock;
+    ret.dblock_size = KS::dblock_size;
 
     return ret;
 
@@ -90,6 +126,8 @@ get_kernel_gpu_sizes(int approx_order)
     {
         case 1: return get_kernel_gpu_sizes<1>();
         case 2: return get_kernel_gpu_sizes<2>();
+        case 3: return get_kernel_gpu_sizes<3>();
+        case 4: return get_kernel_gpu_sizes<4>();
     }
 
     throw std::invalid_argument("Can't retrieve info for this element");
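
Note on the dblock sizing above: a dblock holds dblock_size = 128 doubles, so every specialization must satisfy num_bf * cells_per_dblock <= dblock_size (4*32 = 128, 10*12 = 120, 20*6 = 120, 35*3 = 105, 56*2 = 112, 84*1 = 84), since gpu_deriv_planar_blocked masks lanes with threadIdx.x >= dofs_per_dblock out of a 128-wide block. A compile-time guard in this spirit could be added to kernels_gpu.h; the following is only a sketch, not part of the patch, and the name dblock_fits is illustrative:

    // Sketch (not in the patch): check that each specialization packs the
    // DOFs of its cells into a single dblock without overflowing it.
    template<size_t K>
    constexpr bool dblock_fits()
    {
        using KS = kernel_gpu_sizes<K>;
        return KS::num_bf * KS::cells_per_dblock <= KS::dblock_size;
    }
    static_assert(dblock_fits<1>() && dblock_fits<2>() && dblock_fits<3>() &&
                  dblock_fits<4>() && dblock_fits<5>() && dblock_fits<6>(),
                  "cells_per_dblock too large for dblock_size");
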
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index c662a7c249de7d3cbce2c5f160d038c2607135fe..1c06b3ffd6c35baa0e03ed97a123363b9e4ef172 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -14,7 +14,7 @@ gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J,
     if (ofs_in_entity >= num_all_elems*KS::num_bf)
         return;
 
-    int32_t cur_dof_offset = dof_base + ofs_in_entity;
+    int32_t output_dof_offset = dof_base + ofs_in_entity;
 
     int32_t iT = ofs_in_entity / KS::num_bf;
     int32_t elem_dof_base = dof_base + iT*KS::num_bf;
@@ -48,9 +48,68 @@ gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J,
         accm_dF_dz += J[jac_ofs+8] * v;
     }
 
-    dF_dx[cur_dof_offset] = accm_dF_dx;
-    dF_dy[cur_dof_offset] = accm_dF_dy;
-    dF_dz[cur_dof_offset] = accm_dF_dz;
+    dF_dx[output_dof_offset] = accm_dF_dx;
+    dF_dy[output_dof_offset] = accm_dF_dy;
+    dF_dz[output_dof_offset] = accm_dF_dz;
+}
+
+template<size_t K>
+__global__ void
+gpu_deriv_planar_blocked(gpuTextureObject_t F, const double * __restrict__ J,
+    gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
+    double * __restrict__ dF_dy, double * __restrict__ dF_dz,
+    int32_t num_all_elems, int32_t* orients, int32_t dof_base)
+{
+    using KS = kernel_gpu_sizes<K>;
+
+    if (threadIdx.x >= KS::dofs_per_dblock)
+        return;
+
+    int32_t elem_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::cells_per_dblock;
+    int32_t elem_ofs = threadIdx.x / KS::num_bf;
+    int32_t iT = elem_base + elem_ofs;
+
+    if (iT >= num_all_elems)
+        return;
+
+    int32_t block_dof_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::dblock_size;
+
+    int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x;
+
+    int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf;
+    int32_t iO = orients[iT];
+    int32_t DM_row = threadIdx.x % KS::num_bf;
+    int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+
+    double accm_dF_dx = 0.0;
+    double accm_dF_dy = 0.0;
+    double accm_dF_dz = 0.0;
+    int32_t jac_ofs = 9*iT;
+    for (int32_t dof = 0; dof < KS::num_bf; dof++)
+    {
+        int32_t d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof;
+        int32_t f_ofs = elem_dof_base + dof;
+        double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        accm_dF_dx += J[jac_ofs+0] * v;
+        accm_dF_dy += J[jac_ofs+3] * v;
+        accm_dF_dz += J[jac_ofs+6] * v;
+
+        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        accm_dF_dx += J[jac_ofs+1] * v;
+        accm_dF_dy += J[jac_ofs+4] * v;
+        accm_dF_dz += J[jac_ofs+7] * v;
+
+        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + 2*KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        accm_dF_dx += J[jac_ofs+2] * v;
+        accm_dF_dy += J[jac_ofs+5] * v;
+        accm_dF_dz += J[jac_ofs+8] * v;
+    }
+
+    dF_dx[output_dof_offset] = accm_dF_dx;
+    dF_dy[output_dof_offset] = accm_dF_dy;
+    dF_dz[output_dof_offset] = accm_dF_dz;
 }
 
 template<size_t K>
@@ -58,22 +117,37 @@
 void
 launch_deriv_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *df_dx, double* df_dy, double* df_dz)
 {
-    const auto THREADS_PER_BLOCK = kernel_gpu_sizes<K>::deriv_threads;
-    auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
-    if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
-        num_blocks += 1;
-
     auto J = edg.jacobians.data();
     auto Dtex = edg.differentiation_matrices.get_texture();
     int32_t num_elems = edg.num_all_elems;
     auto orients = edg.element_orientation.data();
     auto num_orients = edg.element_orientation.size();
 
+    using KS = kernel_gpu_sizes<K>;
+
+#ifdef USE_BLOCKED_GPU_KERNELS
+    size_t num_blocks = edg.num_all_elems / (KS::cells_per_dblock * KS::parallel_dblocks);
+    if (edg.num_all_elems % (KS::cells_per_dblock * KS::parallel_dblocks))
+        num_blocks += 1;
+
+    dim3 grid_size(1, num_blocks);
+    dim3 block_size(KS::dblock_size, KS::parallel_dblocks);
+    if (edg.g_order == 1)
+        gpu_deriv_planar_blocked<K><<<grid_size, block_size>>>(f, J,
+            Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
+    //else
+    //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
+#else
+    auto num_blocks = edg.num_bf*edg.num_all_elems/KS::deriv_threads;
+    if (edg.num_bf*edg.num_all_elems % KS::deriv_threads)
+        num_blocks += 1;
+
     if (edg.g_order == 1)
-        gpu_deriv_planar<K><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+        gpu_deriv_planar<K><<<num_blocks, KS::deriv_threads>>>(f, J,
             Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
     //else
     //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
+#endif
 }
 
 void
@@ -152,7 +226,7 @@ template<size_t K>
 void
 launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
 {
-    const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
+    const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads;
     auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
     if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
         num_blocks += 1;
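
Worked example of the blocked launch geometry (illustrative numbers; the element count 1000 is arbitrary). For K = 2 the patch sets num_bf = 10, cells_per_dblock = 12, dblock_size = 128, parallel_dblocks = 4, so:

    cells per thread block = cells_per_dblock * parallel_dblocks = 12 * 4 = 48
    num_blocks             = ceil(1000 / 48) = 21
    launch configuration   = grid (1, 21), block (128, 4)
    idle lanes per dblock  = dblock_size - dofs_per_dblock = 128 - 120 = 8

The lanes with threadIdx.x >= dofs_per_dblock return immediately in gpu_deriv_planar_blocked; they correspond to the padding slots of each dblock.
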
diff --git a/src/kernels_gpu_mi.cpp b/src/kernels_gpu_mi.cpp
index 4983265fddf7c3046bae0028c447b546781f899a..fb431a65cd16a5475246f6bbe13685bee23f30d8 100644
--- a/src/kernels_gpu_mi.cpp
+++ b/src/kernels_gpu_mi.cpp
@@ -1,6 +1,5 @@
 #include "kernels_gpu.h"
 
-#if 0
 /* Return how many dblocks are needed to store this entity on GPU. */
 size_t gpu_dblocks(const entity_data_cpu& ed)
 {
@@ -37,7 +36,7 @@ void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
     {
         for (size_t j = 0; j < ks.cells_per_dblock; j++)
         {
-            auto cur_elem = ks.cells_per_dblock*i + j;
+            auto cur_elem = ks.cells_per_dblock*i + j;
             if (cur_elem >= tot_elems)
                 break;
 
@@ -53,4 +52,4 @@
         }
     }
 }
-#endif
+
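
The code re-enabled above (previously under #if 0) gives the blocked test its round trip between the natural DOF layout and the dblock-padded one. Schematically, with the meaning of the final bool inferred from the call sites in tests/test_differentiation_gpu.cpp below:

    vecxd Pf_reshaped = vecxd::Zero(model_num_dofs_gpu);
    reshape_dofs(ed, edg, Pf, Pf_reshaped, true);     // natural -> blocked
    /* ... launch the blocked kernels reading Pf_reshaped ... */
    reshape_dofs(ed, edg, Cdf_dx_exp, Cdf_dx, false); // blocked -> natural
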
diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp
index a252212b6fa315d163d7544635e1ecb81b549dd9..4431c0a1297518700015d0e60ba3dd22f25bad81 100644
--- a/tests/test_differentiation_gpu.cpp
+++ b/tests/test_differentiation_gpu.cpp
@@ -89,12 +89,15 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
     vecxd Pdf_dx = vecxd::Zero(model_num_dofs);
     vecxd Pdf_dy = vecxd::Zero(model_num_dofs);
     vecxd Pdf_dz = vecxd::Zero(model_num_dofs);
 
-    vecxd Cdf_dx = vecxd::Zero(model_num_dofs);
-    vecxd Cdf_dy = vecxd::Zero(model_num_dofs);
-    vecxd Cdf_dz = vecxd::Zero(model_num_dofs);
 
     std::vector<entity_data_gpu> edgs;
+#ifdef USE_BLOCKED_GPU_KERNELS
+    std::vector<entity_data_cpu> eds;
+    size_t model_num_dofs_gpu = 0;
+#else
+    size_t model_num_dofs_gpu = model_num_dofs;
+#endif
     for (auto& e : mod)
     {
         e.project(f, Pf);
@@ -104,14 +107,27 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         entity_data_cpu ed;
         e.populate_entity_data(ed);
         entity_data_gpu edg(ed);
+#ifdef USE_BLOCKED_GPU_KERNELS
+        edg.dof_base = model_num_dofs_gpu;
+        model_num_dofs_gpu += gpu_dblocks_dofs(ed);
+        eds.push_back( std::move(ed) );
+#endif
         edgs.push_back( std::move(edg) );
     }
 
     /* Prepare I/O vectors and call kernel */
+#ifdef USE_BLOCKED_GPU_KERNELS
+    assert(eds.size() == edgs.size());
+    vecxd Pf_reshaped = vecxd::Zero(model_num_dofs_gpu);
+    for (size_t i = 0; i < eds.size(); i++)
+        reshape_dofs(eds[i], edgs[i], Pf, Pf_reshaped, true);
+    texture_allocator<double> Pf_gpu(Pf_reshaped.data(), Pf_reshaped.size());
+#else
     texture_allocator<double> Pf_gpu(Pf.data(), Pf.size());
-    device_vector<double> df_dx_gpu(model_num_dofs);
-    device_vector<double> df_dy_gpu(model_num_dofs);
-    device_vector<double> df_dz_gpu(model_num_dofs);
+#endif
+    device_vector<double> df_dx_gpu(model_num_dofs_gpu);
+    device_vector<double> df_dy_gpu(model_num_dofs_gpu);
+    device_vector<double> df_dz_gpu(model_num_dofs_gpu);
 
     for (auto& edg : edgs)
     {
@@ -136,9 +152,37 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         }
     }
 
+#ifdef USE_BLOCKED_GPU_KERNELS
+    vecxd Cdf_dx_exp = vecxd::Zero(model_num_dofs_gpu);
+    vecxd Cdf_dy_exp = vecxd::Zero(model_num_dofs_gpu);
+    vecxd Cdf_dz_exp = vecxd::Zero(model_num_dofs_gpu);
+
+    df_dx_gpu.copyout( Cdf_dx_exp.data() );
+    df_dy_gpu.copyout( Cdf_dy_exp.data() );
+    df_dz_gpu.copyout( Cdf_dz_exp.data() );
+
+    vecxd Cdf_dx = vecxd::Zero(model_num_dofs);
+    vecxd Cdf_dy = vecxd::Zero(model_num_dofs);
+    vecxd Cdf_dz = vecxd::Zero(model_num_dofs);
+
+    assert(eds.size() == edgs.size());
+    Pf = vecxd::Zero(model_num_dofs);
+    for (size_t i = 0; i < eds.size(); i++)
+    {
+        reshape_dofs(eds[i], edgs[i], Cdf_dx_exp, Cdf_dx, false);
+        reshape_dofs(eds[i], edgs[i], Cdf_dy_exp, Cdf_dy, false);
+        reshape_dofs(eds[i], edgs[i], Cdf_dz_exp, Cdf_dz, false);
+        reshape_dofs(eds[i], edgs[i], Pf_reshaped, Pf, false);
+    }
+#else
+    vecxd Cdf_dx = vecxd::Zero(model_num_dofs_gpu);
+    vecxd Cdf_dy = vecxd::Zero(model_num_dofs_gpu);
+    vecxd Cdf_dz = vecxd::Zero(model_num_dofs_gpu);
+
     df_dx_gpu.copyout( Cdf_dx.data() );
     df_dy_gpu.copyout( Cdf_dy.data() );
     df_dz_gpu.copyout( Cdf_dz.data() );
+#endif
 
     double err_x = 0.0;
     double err_y = 0.0;
@@ -173,6 +217,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
             auto gi = e.cell_global_index_by_gmsh(iT);
             assert(gi < model_num_cells);
             vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+            var_Pf[gi] = Pf.segment(ofs, num_bf).dot(phi_bar);
             var_df_dx[gi] = Cdf_dx.segment(ofs, num_bf).dot(phi_bar);
             var_df_dy[gi] = Cdf_dy.segment(ofs, num_bf).dot(phi_bar);
             var_df_dz[gi] = Cdf_dz.segment(ofs, num_bf).dot(phi_bar);
@@ -183,6 +228,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
 #ifdef WRITE_TEST_OUTPUTS
+        silodb.write_zonal_variable("f", var_Pf);
         silodb.write_zonal_variable("df_dx", var_df_dx);
         silodb.write_zonal_variable("df_dy", var_df_dy);
         silodb.write_zonal_variable("df_dz", var_df_dz);
@@ -224,7 +270,7 @@ int main(void)
     std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
 
     for (size_t go = 1; go < 2; go++)
-        for (size_t ao = go; ao < 7; ao++)
+        for (size_t ao = go; ao < 5; ao++)
             failed_tests += test_differentiation_convergence(go, ao);
 
     return failed_tests;
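
On the sizes in this test: with USE_BLOCKED_GPU_KERNELS, model_num_dofs_gpu accumulates gpu_dblocks_dofs(ed) per entity and therefore counts whole 128-double dblocks, so it is generally larger than model_num_dofs. For example, at order 4 a dblock carries 3 cells * 35 DOFs = 105 of its 128 slots (23 padding doubles per dblock), while at order 1 the fit is exact (32 * 4 = 128).
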
diff --git a/tests/test_lifting_gpu.cpp b/tests/test_lifting_gpu.cpp
index c1b795763901c070bb01a3114b066f787e3a338a..ee38029c153f0dc53fdf89915afe9086e6d879c9 100644
--- a/tests/test_lifting_gpu.cpp
+++ b/tests/test_lifting_gpu.cpp
@@ -252,7 +252,7 @@ int main(void)
     int failed_tests = 0;
 
     std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl;
-    for (size_t ao = 1; ao < 4; ao++)
+    for (size_t ao = 1; ao < 7; ao++)
         test_lifting(1, ao);
 
     gmsh::finalize();
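
Both CMake options must be enabled to exercise the blocked path; assuming the usual out-of-source workflow, a configure line would look like:

    cmake -DOPT_ENABLE_GPU_SOLVER=ON -DOPT_BLOCKED_GPU_KERNELS=ON ..

With OPT_BLOCKED_GPU_KERNELS left at its default (OFF) the previous unblocked kernels are compiled, so existing configurations are unaffected.
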