diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index f3a4c59a2475695cb287e62ed792f59f0b73c08a..7b8f1b4d2a26885f4d7fe9596a3f45f8dd1187ac 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -19,6 +19,8 @@ struct kernel_gpu_sizes<1>
     static const size_t dblock_bf = num_bf * cells_per_dblock;
     static const size_t dblock_size = 128;
     static const size_t parallel_dblocks = 4;
+
+    static const size_t deriv_threads = 128;
 };
 
 template<>
@@ -29,6 +31,8 @@ struct kernel_gpu_sizes<2>
     static const size_t dblock_bf = num_bf * cells_per_dblock;
     static const size_t dblock_size = 128;
     static const size_t parallel_dblocks = 1;
+
+    static const size_t deriv_threads = 128;
 };
 
 template<>
@@ -39,6 +43,8 @@ struct kernel_gpu_sizes<3>
     static const size_t dblock_bf = num_bf * cells_per_dblock;
     static const size_t dblock_size = 128;
     static const size_t parallel_dblocks = 1;
+
+    static const size_t deriv_threads = 128;
 };
 
 template<>
@@ -49,6 +55,8 @@ struct kernel_gpu_sizes<4>
     static const size_t dblock_bf = num_bf * cells_per_dblock;
     static const size_t dblock_size = 128;
     static const size_t parallel_dblocks = 1;
+
+    static const size_t deriv_threads = 512;
 };
 
 template<>
@@ -59,6 +67,20 @@ struct kernel_gpu_sizes<5>
     static const size_t dblock_bf = num_bf * cells_per_dblock;
     static const size_t dblock_size = 128;
     static const size_t parallel_dblocks = 1;
+
+    static const size_t deriv_threads = 1024;
+};
+
+template<>
+struct kernel_gpu_sizes<6>
+{
+    static const size_t num_bf = 84;
+    static const size_t cells_per_dblock = 12;
+    static const size_t dblock_bf = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 1;
+
+    static const size_t deriv_threads = 1024;
 };
 
 struct kernel_gpu_sizes_runtime
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 80fb6880b784a08ca58bb09e43d27556e995e849..d4d0939237085ac8f48accbd18f20c33d55191ac 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -56,7 +56,6 @@ void
 gpu_compute_field_derivatives(entity_data_gpu& edg,
     const double* f, double *df_dx, double* df_dy, double* df_dz)
 {
-    const auto THREADS_PER_BLOCK = 128;
     auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
     if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
         num_blocks += 1;
@@ -71,7 +70,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
     {
     case 1:
         if (edg.g_order == 1)
-            gpu_deriv_planar<1><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+            gpu_deriv_planar<1><<<num_blocks, kernel_gpu_sizes<1>::deriv_threads>>>(f, J,
                 Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
@@ -79,7 +78,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
 
     case 2:
         if (edg.g_order == 1)
-            gpu_deriv_planar<2><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+            gpu_deriv_planar<2><<<num_blocks, kernel_gpu_sizes<2>::deriv_threads>>>(f, J,
                 Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<2>(ed, f, df_dx, df_dy, df_dz);
@@ -87,7 +86,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
 
     case 3:
         if (edg.g_order == 1)
-            gpu_deriv_planar<3><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+            gpu_deriv_planar<3><<<num_blocks, kernel_gpu_sizes<3>::deriv_threads>>>(f, J,
                 Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<3>(ed, f, df_dx, df_dy, df_dz);
@@ -95,7 +94,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
 
     case 4:
         if (edg.g_order == 1)
-            gpu_deriv_planar<4><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+            gpu_deriv_planar<4><<<num_blocks, kernel_gpu_sizes<4>::deriv_threads>>>(f, J,
                 Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<4>(ed, f, df_dx, df_dy, df_dz);
@@ -103,19 +102,19 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
 
     case 5:
         if (edg.g_order == 1)
-            gpu_deriv_planar<5><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+            gpu_deriv_planar<5><<<num_blocks, kernel_gpu_sizes<5>::deriv_threads>>>(f, J,
                 Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<5>(ed, f, df_dx, df_dy, df_dz);
         break;
-#if 0
+
     case 6:
         if (edg.g_order == 1)
-            gpu_compute_field_derivatives_kernel_planar<6>(edg, f, df_dx, df_dy, df_dz);
+            gpu_deriv_planar<6><<<num_blocks, kernel_gpu_sizes<6>::deriv_threads>>>(f, J,
+                Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<6>(ed, f, df_dx, df_dy, df_dz);
         break;
-#endif
 
     default:
         std::cout << "compute_field_derivatives: invalid order" << std::endl;
diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp
index b4c48bb4517bd6300c8a32094a03dfeb52ec0a85..84bb32549924eb81327d108f42f3fc2b49a8e6f4 100644
--- a/tests/test_differentiation_gpu.cpp
+++ b/tests/test_differentiation_gpu.cpp
@@ -162,7 +162,7 @@ int main(void)
     std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
 
     for (size_t go = 1; go < 2; go++)
-        for (size_t ao = go; ao < 5; ao++)
+        for (size_t ao = go; ao < 6; ao++)
             failed_tests += test_differentiation_convergence(go, ao);
 
     return failed_tests;
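
For reference, the change above replaces the single hard-coded 128-thread launch with a per-order block size taken from kernel_gpu_sizes<K>::deriv_threads (128 for orders 1-3, 512 for order 4, 1024 for orders 5 and 6). The snippet below is a minimal, self-contained sketch of that launch pattern only: kernel_gpu_sizes here is a simplified stand-in, and dummy_deriv / launch_deriv are hypothetical names, not the project's gpu_deriv_planar<K> kernel, which additionally takes the Jacobians J, the differentiation-matrix texture Dtex and the orientation table orients.

// Hypothetical stand-alone sketch (not part of the patch above): it only
// illustrates tying the kernel launch width to a compile-time, per-order
// constant kernel_gpu_sizes<K>::deriv_threads.
#include <cstdio>
#include <cstddef>
#include <cuda_runtime.h>

template<size_t K>
struct kernel_gpu_sizes
{
    // Simplified stand-in: 128 threads for low orders, wider blocks above.
    static const size_t deriv_threads = (K <= 3) ? 128 : (K == 4 ? 512 : 1024);
};

// Stand-in for gpu_deriv_planar<K>: one thread per degree of freedom.
template<size_t K>
__global__ void dummy_deriv(const double* f, double* df, size_t num_dofs)
{
    size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_dofs)
        df[i] = 2.0 * f[i];   // placeholder for the actual derivative computation
}

// Round the grid size up so num_dofs threads are covered by blocks of
// kernel_gpu_sizes<K>::deriv_threads threads each.
template<size_t K>
void launch_deriv(const double* f, double* df, size_t num_dofs)
{
    const size_t threads = kernel_gpu_sizes<K>::deriv_threads;
    const size_t blocks  = (num_dofs + threads - 1) / threads;
    dummy_deriv<K><<<blocks, threads>>>(f, df, num_dofs);
}

int main(void)
{
    const size_t num_dofs = 84 * 1000;   // e.g. 84 basis functions x 1000 elements
    double *f, *df;
    cudaMalloc(&f,  num_dofs * sizeof(double));
    cudaMalloc(&df, num_dofs * sizeof(double));
    cudaMemset(f, 0, num_dofs * sizeof(double));

    launch_deriv<6>(f, df, num_dofs);    // order 6 selects 1024 threads per block
    cudaDeviceSynchronize();
    printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));

    cudaFree(f);
    cudaFree(df);
    return 0;
}

Compiled with nvcc, launch_deriv<6> in this sketch picks a 1024-wide block, matching the deriv_threads value the patch introduces for kernel_gpu_sizes<6>, while low orders keep the previous 128-thread configuration.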