diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h index 7b8f1b4d2a26885f4d7fe9596a3f45f8dd1187ac..36c8448e68396fb636dc4f2024959e1333de42b9 100644 --- a/include/kernels_gpu.h +++ b/include/kernels_gpu.h @@ -129,4 +129,4 @@ void reshape_dofs(const entity_data_cpu&, const entity_data_gpu&, const vecxd&, void gpu_compute_field_derivatives(entity_data_gpu& edg, - const double* F, double *dF_dx, double* dF_dy, double* dF_dz); + gpuTextureObject_t F, double *dF_dx, double* dF_dy, double* dF_dz); diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu index b3e83015e787e713876bcdbe58b93a690d4e134a..caa73b4cd161c62075b251b145f6f27478ad8c14 100644 --- a/src/kernels_cuda.cu +++ b/src/kernels_cuda.cu @@ -3,9 +3,10 @@ template<size_t K> __global__ void -gpu_deriv_planar(const double *F, const double *J, gpuTextureObject_t DM_tex, - double *dF_dx, double* dF_dy, double* dF_dz, int32_t num_all_elems, - int32_t* orients) +gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J, + gpuTextureObject_t DM_tex, double * __restrict__ dF_dx, + double * __restrict__ dF_dy, double * __restrict__ dF_dz, + int32_t num_all_elems, int32_t* orients) { using KS = kernel_gpu_sizes<K>; @@ -29,19 +30,19 @@ gpu_deriv_planar(const double *F, const double *J, gpuTextureObject_t DM_tex, { int32_t d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof; int32_t f_ofs = elem_dof_base + dof; - double v = fetch_tex(DM_tex, d_ofs) * F[f_ofs]; + double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs); accm_dF_dx += J[jac_ofs+0] * v; accm_dF_dy += J[jac_ofs+3] * v; accm_dF_dz += J[jac_ofs+6] * v; d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + KS::num_bf; - v = fetch_tex(DM_tex, d_ofs) * F[f_ofs]; + v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs); accm_dF_dx += J[jac_ofs+1] * v; accm_dF_dy += J[jac_ofs+4] * v; accm_dF_dz += J[jac_ofs+7] * v; d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + 2*KS::num_bf; - v = fetch_tex(DM_tex, d_ofs) * F[f_ofs]; + v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs); accm_dF_dx += J[jac_ofs+2] * v; accm_dF_dy += J[jac_ofs+5] * v; accm_dF_dz += J[jac_ofs+8] * v; @@ -55,7 +56,7 @@ gpu_deriv_planar(const double *F, const double *J, gpuTextureObject_t DM_tex, template<size_t K> void launch_deriv_kernel(entity_data_gpu& edg, - const double* f, double *df_dx, double* df_dy, double* df_dz) + gpuTextureObject_t f, double *df_dx, double* df_dy, double* df_dz) { const auto THREADS_PER_BLOCK = kernel_gpu_sizes<K>::deriv_threads; auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK; @@ -77,7 +78,7 @@ launch_deriv_kernel(entity_data_gpu& edg, void gpu_compute_field_derivatives(entity_data_gpu& edg, - const double* f, double *df_dx, double* df_dy, double* df_dz) + gpuTextureObject_t f, double *df_dx, double* df_dy, double* df_dz) { @@ -108,6 +109,6 @@ gpu_compute_field_derivatives(entity_data_gpu& edg, break; default: - std::cout << "compute_field_derivatives: invalid order" << std::endl; + throw std::invalid_argument("compute_field_derivatives: invalid order"); } } \ No newline at end of file diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp index 84bb32549924eb81327d108f42f3fc2b49a8e6f4..ddf6eaa1d80991d27f11ddccad2f4d9e6a95aea3 100644 --- a/tests/test_differentiation_gpu.cpp +++ b/tests/test_differentiation_gpu.cpp @@ -69,14 +69,14 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde entity_data_gpu edg(ed); /* Prepare I/O vectors and call kernel */ - device_vector<double> Pf_gpu(Pf_cpu.data(), Pf_cpu.size()); + texture_allocator<double> Pf_gpu(Pf_cpu.data(), Pf_cpu.size()); device_vector<double> df_dx_gpu(Pf_cpu.size()); device_vector<double> df_dy_gpu(Pf_cpu.size()); device_vector<double> df_dz_gpu(Pf_cpu.size()); timecounter_gpu tc; tc.tic(); - gpu_compute_field_derivatives(edg, Pf_gpu.data(), df_dx_gpu.data(), + gpu_compute_field_derivatives(edg, Pf_gpu.get_texture(), df_dx_gpu.data(), df_dy_gpu.data(), df_dz_gpu.data()); double time = tc.toc(); @@ -162,7 +162,7 @@ int main(void) std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl; for (size_t go = 1; go < 2; go++) - for (size_t ao = go; ao < 6; ao++) + for (size_t ao = go; ao < 7; ao++) failed_tests += test_differentiation_convergence(go, ao); return failed_tests;