From ce97b241fb4dd47a8c847129286649dbb184344d Mon Sep 17 00:00:00 2001
From: Matteo Cicuttin <datafl4sh@toxicnet.eu>
Date: Sat, 1 May 2021 00:56:44 +0200
Subject: [PATCH] Testing perf optimization in lifting (per-row rotated lifting-matrix columns).

---
 src/entity_data.cpp        | 24 +++++++++++++++++++++++-
 src/kernels_cuda.cu        |  6 ++++--
 tests/test_lifting_gpu.cpp |  2 +-
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/entity_data.cpp b/src/entity_data.cpp
index 6f1d826..dbf7ae5 100644
--- a/src/entity_data.cpp
+++ b/src/entity_data.cpp
@@ -45,11 +45,33 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
     auto LMrows = ed.num_bf;
     auto LMcols = 4*ed.num_fluxes*ed.num_orientations;
     MatxCM<double> lm = MatxCM<double>::Zero(LMrows, LMcols);
-    for (size_t i = 0; i < ed.num_orientations; i++)
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
     {
+        /*
         lm.block(0, 4*i*ed.num_fluxes, ed.num_bf, 4*ed.num_fluxes) = 
             ed.lifting_matrices.block(i*ed.num_bf, 0, ed.num_bf, 4*ed.num_fluxes);
+        */
+        for (size_t i = 0; i < ed.num_bf; i++)
+        {
+            for (size_t j = 0; j < 4*ed.num_fluxes; j++)
+            {
+                auto src_row = iO*ed.num_bf+i;
+                auto src_col = j;
+                auto dst_row = i;
+                auto dst_col = iO*4*ed.num_fluxes + ( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) );
+                lm(dst_row, dst_col) = ed.lifting_matrices(src_row, src_col);
+            }
+        }
     }
+/*
+    matxd orig = ed.lifting_matrices.block(0, 0, ed.num_bf, 4*ed.num_fluxes);
+    matxd rotated = lm.block(0, 0, ed.num_bf, 4*ed.num_fluxes);
+
+    std::cout << "*** ORIG ***" << std::endl;
+    std::cout << orig << std::endl;
+    std::cout << "*** ROTATED ***" << std::endl;
+    std::cout << rotated << std::endl;
+*/
     lifting_matrices.init(lm.data(), lm.size());
 
     jacobians.copyin(ed.jacobians.data(), ed.jacobians.size());
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 71e4405..c662a7c 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -135,11 +135,13 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
     int32_t LM_orient = 4*KS::num_bf*KS::num_fluxes*iO;
     double inv_det = 1./dets[iT];
 
+    int32_t delta = ofs_in_entity % KS::num_bf;
+
     double acc = 0.0;
     for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
     {
         int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
-        int32_t f_ofs = elem_flux_base + dof;
+        int32_t f_ofs = elem_flux_base + (dof+delta)%(4*KS::num_fluxes);
         acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs);
     }
 
@@ -150,7 +152,7 @@ template<size_t K>
 void
 launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
 {
-    const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads;
+    const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
     auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
     if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
         num_blocks += 1;
diff --git a/tests/test_lifting_gpu.cpp b/tests/test_lifting_gpu.cpp
index c9277d1..c1b7957 100644
--- a/tests/test_lifting_gpu.cpp
+++ b/tests/test_lifting_gpu.cpp
@@ -252,7 +252,7 @@ int main(void)
     int failed_tests = 0;
 
     std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl;
-    for (size_t ao = 1; ao < 2; ao++)
+    for (size_t ao = 1; ao < 4; ao++)
             test_lifting(1, ao);
 
     gmsh::finalize();
-- 
GitLab