diff --git a/src/entity_data.cpp b/src/entity_data.cpp
index 6f1d826a4d43f317a8a8301fa2a2b7bf202906b8..dbf7ae52c72b2fe83d0933f5659bb72bde75a209 100644
--- a/src/entity_data.cpp
+++ b/src/entity_data.cpp
@@ -45,11 +45,33 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
     auto LMrows = ed.num_bf;
     auto LMcols = 4*ed.num_fluxes*ed.num_orientations;
     MatxCM<double> lm = MatxCM<double>::Zero(LMrows, LMcols);
-    for (size_t i = 0; i < ed.num_orientations; i++)
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
     {
+        /*
         lm.block(0, 4*i*ed.num_fluxes, ed.num_bf, 4*ed.num_fluxes) = 
             ed.lifting_matrices.block(i*ed.num_bf, 0, ed.num_bf, 4*ed.num_fluxes);
+        */
+        // Copy orientation iO's block into lm, rotating row i left by i
+        // columns (mod ncols); gpu_lift_planar reads the fluxes with the
+        // matching (dof+delta) % (4*num_fluxes) index.
+        const size_t ncols = 4*ed.num_fluxes;
+        for (size_t i = 0; i < ed.num_bf; i++)
+        {
+            // i % ncols: num_bf may exceed ncols, and (ncols - i) would wrap.
+            for (size_t j = 0; j < ncols; j++)
+                lm(i, iO*ncols + (ncols - i%ncols + j)%ncols) =
+                    ed.lifting_matrices(iO*ed.num_bf + i, j);
+        }
     }
+/*
+    matxd orig = ed.lifting_matrices.block(0, 0, ed.num_bf, 4*ed.num_fluxes);
+    matxd rotated = lm.block(0, 0, ed.num_bf, 4*ed.num_fluxes);
+
+    std::cout << "*** ORIG ***" << std::endl;
+    std::cout << orig << std::endl;
+    std::cout << "*** ROTATED ***" << std::endl;
+    std::cout << rotated << std::endl;
+*/
     lifting_matrices.init(lm.data(), lm.size());
 
     jacobians.copyin(ed.jacobians.data(), ed.jacobians.size());
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 71e440537d89af5b9e2ce4f9ec636d29d548bcfb..c662a7c249de7d3cbce2c5f160d038c2607135fe 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -135,11 +135,13 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
     int32_t LM_orient = 4*KS::num_bf*KS::num_fluxes*iO;
     double inv_det = 1./dets[iT];
 
+    int32_t delta = ofs_in_entity % KS::num_bf; // per-thread rotation of the flux index; presumably equals this thread's LM row -- TODO confirm
+
     double acc = 0.0;
     for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
     {
         int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
-        int32_t f_ofs = elem_flux_base + dof;
+        int32_t f_ofs = elem_flux_base + (dof+delta)%(4*KS::num_fluxes); // start at delta and wrap; pairs with the column rotation applied to lifting_matrices in entity_data_gpu
         acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs);
     }
 
@@ -150,7 +152,7 @@ template<size_t K>
 void
 launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
 {
-    const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads;
+    const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
     auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
     if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
         num_blocks += 1;
diff --git a/tests/test_lifting_gpu.cpp b/tests/test_lifting_gpu.cpp
index c9277d143e99b68893f31e9a4653cef0335169fd..c1b795763901c070bb01a3114b066f787e3a338a 100644
--- a/tests/test_lifting_gpu.cpp
+++ b/tests/test_lifting_gpu.cpp
@@ -252,7 +252,7 @@ int main(void)
     int failed_tests = 0;
 
     std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl;
-    for (size_t ao = 1; ao < 2; ao++)
+    for (size_t ao = 1; ao < 4; ao++)
             test_lifting(1, ao);
 
     gmsh::finalize();