From ce97b241fb4dd47a8c847129286649dbb184344d Mon Sep 17 00:00:00 2001 From: Matteo Cicuttin <datafl4sh@toxicnet.eu> Date: Sat, 1 May 2021 00:56:44 +0200 Subject: [PATCH] Testing perf optimization in lifting. --- src/entity_data.cpp | 24 +++++++++++++++++++++++- src/kernels_cuda.cu | 6 ++++-- tests/test_lifting_gpu.cpp | 2 +- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/entity_data.cpp b/src/entity_data.cpp index 6f1d826..dbf7ae5 100644 --- a/src/entity_data.cpp +++ b/src/entity_data.cpp @@ -45,11 +45,33 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed) auto LMrows = ed.num_bf; auto LMcols = 4*ed.num_fluxes*ed.num_orientations; MatxCM<double> lm = MatxCM<double>::Zero(LMrows, LMcols); - for (size_t i = 0; i < ed.num_orientations; i++) + for (size_t iO = 0; iO < ed.num_orientations; iO++) { + /* lm.block(0, 4*i*ed.num_fluxes, ed.num_bf, 4*ed.num_fluxes) = ed.lifting_matrices.block(i*ed.num_bf, 0, ed.num_bf, 4*ed.num_fluxes); + */ + for (size_t i = 0; i < ed.num_bf; i++) + { + for (size_t j = 0; j < 4*ed.num_fluxes; j++) + { + auto src_row = iO*ed.num_bf+i; + auto src_col = j; + auto dst_row = i; + auto dst_col = iO*4*ed.num_fluxes + ( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) ); + lm(dst_row, dst_col) = ed.lifting_matrices(src_row, src_col); + } + } } +/* + matxd orig = ed.lifting_matrices.block(0, 0, ed.num_bf, 4*ed.num_fluxes); + matxd rotated = lm.block(0, 0, ed.num_bf, 4*ed.num_fluxes); + + std::cout << "*** ORIG ***" << std::endl; + std::cout << orig << std::endl; + std::cout << "*** ROTATED ***" << std::endl; + std::cout << rotated << std::endl; +*/ lifting_matrices.init(lm.data(), lm.size()); jacobians.copyin(ed.jacobians.data(), ed.jacobians.size()); diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu index 71e4405..c662a7c 100644 --- a/src/kernels_cuda.cu +++ b/src/kernels_cuda.cu @@ -135,11 +135,13 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, int32_t LM_orient = 4*KS::num_bf*KS::num_fluxes*iO; double inv_det = 1./dets[iT]; + int32_t delta = ofs_in_entity % KS::num_bf; + double acc = 0.0; for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++) { int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof; - int32_t f_ofs = elem_flux_base + dof; + int32_t f_ofs = elem_flux_base + (dof+delta)%(4*KS::num_fluxes); acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs); } @@ -150,7 +152,7 @@ template<size_t K> void launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out) { - const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads; + const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads; auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK; if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK) num_blocks += 1; diff --git a/tests/test_lifting_gpu.cpp b/tests/test_lifting_gpu.cpp index c9277d1..c1b7957 100644 --- a/tests/test_lifting_gpu.cpp +++ b/tests/test_lifting_gpu.cpp @@ -252,7 +252,7 @@ int main(void) int failed_tests = 0; std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl; - for (size_t ao = 1; ao < 2; ao++) + for (size_t ao = 1; ao < 4; ao++) test_lifting(1, ao); gmsh::finalize(); -- GitLab