diff --git a/CMakeLists.txt b/CMakeLists.txt
index a256dd0dad26554e6230fedddefe068706d27853..e1ece6ee8f84ea181a7dd770c7abd5d61eb98a3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -116,6 +116,11 @@ if (OPT_ENABLE_GPU_SOLVER)
             add_definitions(-D__HIP_PLATFORM_HCC__)
         endif()
     endif()
+
+    option(OPT_BLOCKED_GPU_KERNELS "Use blocked kernels on GPU" OFF)
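+    # Blocked kernels expect DoFs in the per-dblock layout produced by
+    # reshape_dofs(); enable with -DOPT_BLOCKED_GPU_KERNELS=ON at configure time.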
+    if (OPT_BLOCKED_GPU_KERNELS)
+        add_definitions(-DUSE_BLOCKED_GPU_KERNELS)
+    endif()
 endif()
 
 if (OPT_ENABLE_GPU_SOLVER)
diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index f38f35b38eeb2ce8de3347c8156c0881ffa99ee1..3efbbb9ff720506537f0ca7d43e5230a9979fe29 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -10,41 +10,61 @@ struct kernel_gpu_sizes;
 template<>
 struct kernel_gpu_sizes<1>
 {
-    static const size_t num_bf              = num_dofs_3D(1);
-    static const size_t num_fluxes          = num_dofs_2D(1);
+    static const size_t num_bf              = num_dofs_3D(1); //4
+    static const size_t num_fluxes          = num_dofs_2D(1); //3
 
     static const size_t deriv_threads       = 128;
     static const size_t lifting_threads     = 128;
+
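+    /* Blocked layout (see gpu_deriv_planar_blocked): a dblock packs
+     * cells_per_dblock cells into one dblock_size-thread slice; here
+     * 32 cells * 4 DoFs = 128 DoFs fill the slice exactly. */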
+    static const size_t cells_per_dblock    = 32;
+    static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
+    static const size_t dblock_size         = 128;
+    static const size_t parallel_dblocks    = 4;
 };
 
 template<>
 struct kernel_gpu_sizes<2>
 {
-    static const size_t num_bf              = num_dofs_3D(2);
-    static const size_t num_fluxes          = num_dofs_2D(2);
+    static const size_t num_bf              = num_dofs_3D(2); //10
+    static const size_t num_fluxes          = num_dofs_2D(2); //6
 
     static const size_t deriv_threads       = 128;
     static const size_t lifting_threads     = 128;
+
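+    /* 12 cells * 10 DoFs = 120; the remaining 8 threads of the slice idle. */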
+    static const size_t cells_per_dblock    = 12;
+    static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
+    static const size_t dblock_size         = 128;
+    static const size_t parallel_dblocks    = 4;
 };
 
 template<>
 struct kernel_gpu_sizes<3>
 {
-    static const size_t num_bf              = num_dofs_3D(3);
-    static const size_t num_fluxes          = num_dofs_2D(3);
+    static const size_t num_bf              = num_dofs_3D(3); //20
+    static const size_t num_fluxes          = num_dofs_2D(3); //10
 
     static const size_t deriv_threads       = 128;
     static const size_t lifting_threads     = 128;
+
+    static const size_t cells_per_dblock    = 6;
+    static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
+    static const size_t dblock_size         = 128;
+    static const size_t parallel_dblocks    = 4;
 };
 
 template<>
 struct kernel_gpu_sizes<4>
 {
-    static const size_t num_bf              = num_dofs_3D(4);
+    static const size_t num_bf              = num_dofs_3D(4); //35
     static const size_t num_fluxes          = num_dofs_2D(4);
 
     static const size_t deriv_threads       = 512;
     static const size_t lifting_threads     = 128;
+
+    static const size_t cells_per_dblock    = 3;
+    static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
+    static const size_t dblock_size         = 128;
+    static const size_t parallel_dblocks    = 4;
 };
 
 template<>
@@ -55,6 +75,11 @@ struct kernel_gpu_sizes<5>
 
     static const size_t deriv_threads       = 1024;
     static const size_t lifting_threads     = 128;
+
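+    /* Caution: these look like placeholders. 32 cells * 56 DoFs far exceeds
+     * dblock_size, and get_kernel_gpu_sizes() below only dispatches orders
+     * 1 to 4; the same holds for the order-6 specialization. */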
+    static const size_t cells_per_dblock    = 32;
+    static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
+    static const size_t dblock_size         = 128;
+    static const size_t parallel_dblocks    = 4;
 };
 
 template<>
@@ -65,19 +90,30 @@ struct kernel_gpu_sizes<6>
 
     static const size_t deriv_threads       = 1024;
     static const size_t lifting_threads     = 128;
+
+    static const size_t cells_per_dblock    = 32;
+    static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
+    static const size_t dblock_size         = 128;
+    static const size_t parallel_dblocks    = 4;
 };
 
 struct kernel_gpu_sizes_runtime
 {
     size_t      num_bf;
+    size_t      cells_per_dblock;
+    size_t      dblock_size;
 };
 
 template<size_t N>
 kernel_gpu_sizes_runtime get_kernel_gpu_sizes()
 {
     kernel_gpu_sizes_runtime ret;
+
+    using KS = kernel_gpu_sizes<N>;
 
-    ret.num_bf              = kernel_gpu_sizes<N>::num_bf;
+    ret.num_bf              = KS::num_bf;
+    ret.cells_per_dblock    = KS::cells_per_dblock;
+    ret.dblock_size         = KS::dblock_size;
 
 
     return ret;
@@ -90,6 +126,8 @@ get_kernel_gpu_sizes(int approx_order)
     {
         case 1:     return get_kernel_gpu_sizes<1>();
         case 2:     return get_kernel_gpu_sizes<2>();
+        case 3:     return get_kernel_gpu_sizes<3>();
+        case 4:     return get_kernel_gpu_sizes<4>();
     }
 
     throw std::invalid_argument("Can't retrieve info for this element");
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index c662a7c249de7d3cbce2c5f160d038c2607135fe..1c06b3ffd6c35baa0e03ed97a123363b9e4ef172 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -14,7 +14,7 @@ gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J,
     if (ofs_in_entity >= num_all_elems*KS::num_bf)
         return;
 
-    int32_t cur_dof_offset = dof_base + ofs_in_entity;
+    int32_t output_dof_offset = dof_base + ofs_in_entity;
 
     int32_t iT = ofs_in_entity / KS::num_bf;
     int32_t elem_dof_base = dof_base + iT*KS::num_bf;
@@ -48,9 +48,68 @@ gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J,
         accm_dF_dz += J[jac_ofs+8] * v;
     }
 
-    dF_dx[cur_dof_offset] = accm_dF_dx;
-    dF_dy[cur_dof_offset] = accm_dF_dy;
-    dF_dz[cur_dof_offset] = accm_dF_dz;
+    dF_dx[output_dof_offset] = accm_dF_dx;
+    dF_dy[output_dof_offset] = accm_dF_dy;
+    dF_dz[output_dof_offset] = accm_dF_dz;
+}
+
+template<size_t K>
+__global__ void
+gpu_deriv_planar_blocked(gpuTextureObject_t F, const double * __restrict__ J,
+    gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
+    double * __restrict__ dF_dy, double * __restrict__ dF_dz,
+    int32_t num_all_elems, int32_t* orients, int32_t dof_base)
+{
+    using KS = kernel_gpu_sizes<K>;
+
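+    /* Thread mapping: each (blockIdx.y, threadIdx.y) pair owns one dblock and
+     * threadIdx.x selects a DoF inside it; threads past the dblock's last DoF
+     * (dblock_size may exceed dofs_per_dblock) are padding and exit early. */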
+    if (threadIdx.x >= KS::dofs_per_dblock)
+        return;
+
+    int32_t elem_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::cells_per_dblock;
+    int32_t elem_ofs = threadIdx.x / KS::num_bf;
+    int32_t iT = elem_base + elem_ofs;
+
+    if (iT >= num_all_elems)
+        return;
+
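+    /* Every dblock occupies a full dblock_size-wide slice of the DoF vectors,
+     * so per-entity storage is padded (cf. gpu_dblocks_dofs()). */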
+    int32_t block_dof_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::dblock_size;
+
+    int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x;
+
+    int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf;
+    int32_t iO = orients[iT];
+    int32_t DM_row = threadIdx.x % KS::num_bf;
+    int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+
+    double accm_dF_dx = 0.0;
+    double accm_dF_dy = 0.0;
+    double accm_dF_dz = 0.0;
+    int32_t jac_ofs = 9*iT;
+    for (int32_t dof = 0; dof < KS::num_bf; dof++)
+    {
+        int32_t d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof;
+        int32_t f_ofs = elem_dof_base + dof;
+        double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        accm_dF_dx += J[jac_ofs+0] * v;
+        accm_dF_dy += J[jac_ofs+3] * v;
+        accm_dF_dz += J[jac_ofs+6] * v;
+
+        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        accm_dF_dx += J[jac_ofs+1] * v;
+        accm_dF_dy += J[jac_ofs+4] * v;
+        accm_dF_dz += J[jac_ofs+7] * v;
+
+        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + 2*KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        accm_dF_dx += J[jac_ofs+2] * v;
+        accm_dF_dy += J[jac_ofs+5] * v;
+        accm_dF_dz += J[jac_ofs+8] * v;
+    }
+
+    dF_dx[output_dof_offset] = accm_dF_dx;
+    dF_dy[output_dof_offset] = accm_dF_dy;
+    dF_dz[output_dof_offset] = accm_dF_dz;
 }
 
 template<size_t K>
@@ -58,22 +117,37 @@ void
 launch_deriv_kernel(entity_data_gpu& edg,
     gpuTextureObject_t f, double *df_dx, double* df_dy, double* df_dz)
 {
-    const auto THREADS_PER_BLOCK = kernel_gpu_sizes<K>::deriv_threads;
-    auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
-    if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
-        num_blocks += 1;
-
     auto J = edg.jacobians.data();
     auto Dtex = edg.differentiation_matrices.get_texture();
     int32_t num_elems = edg.num_all_elems;
     auto orients = edg.element_orientation.data();
     auto num_orients = edg.element_orientation.size();
 
+    using KS = kernel_gpu_sizes<K>;
+
+#ifdef USE_BLOCKED_GPU_KERNELS
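+    /* One grid row per group of parallel_dblocks dblocks, each covering
+     * cells_per_dblock cells; round up so all elements are processed. */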
+    size_t num_blocks = edg.num_all_elems / (KS::cells_per_dblock * KS::parallel_dblocks);
+    if (edg.num_all_elems % (KS::cells_per_dblock * KS::parallel_dblocks))
+        num_blocks += 1;
+
+    dim3 grid_size(1, num_blocks);
+    dim3 block_size(KS::dblock_size, KS::parallel_dblocks);
+    if (edg.g_order == 1)
+        gpu_deriv_planar_blocked<K><<<grid_size, block_size>>>(f, J,
+            Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
+    //else
+    //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
+#else
+    auto num_blocks = edg.num_bf*edg.num_all_elems/KS::deriv_threads;
+    if (edg.num_bf*edg.num_all_elems % KS::deriv_threads)
+        num_blocks += 1;
+
     if (edg.g_order == 1)
-        gpu_deriv_planar<K><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+        gpu_deriv_planar<K><<<num_blocks, KS::deriv_threads>>>(f, J,
             Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
     //else
     //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
+#endif
 }
 
 void
@@ -152,7 +226,7 @@ template<size_t K>
 void
 launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
 {
-    const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
+    const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads;
     auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
     if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
         num_blocks += 1;
diff --git a/src/kernels_gpu_mi.cpp b/src/kernels_gpu_mi.cpp
index 4983265fddf7c3046bae0028c447b546781f899a..fb431a65cd16a5475246f6bbe13685bee23f30d8 100644
--- a/src/kernels_gpu_mi.cpp
+++ b/src/kernels_gpu_mi.cpp
@@ -1,6 +1,5 @@
 #include "kernels_gpu.h"
 
-#if 0
 /* Return how many dblocks are needed to store this entity on GPU. */
 size_t gpu_dblocks(const entity_data_cpu& ed)
 {
@@ -37,7 +36,7 @@ void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
     {
         for (size_t j = 0; j < ks.cells_per_dblock; j++)
         {
-            auto cur_elem =  ks.cells_per_dblock*i + j;
+            auto cur_elem = ks.cells_per_dblock*i + j;
             if (cur_elem >= tot_elems)
                 break;
             
@@ -53,4 +52,4 @@ void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
         }
     }
 }
-#endif
+
diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp
index a252212b6fa315d163d7544635e1ecb81b549dd9..4431c0a1297518700015d0e60ba3dd22f25bad81 100644
--- a/tests/test_differentiation_gpu.cpp
+++ b/tests/test_differentiation_gpu.cpp
@@ -89,12 +89,15 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         vecxd Pdf_dx    = vecxd::Zero(model_num_dofs);
         vecxd Pdf_dy    = vecxd::Zero(model_num_dofs);
         vecxd Pdf_dz    = vecxd::Zero(model_num_dofs);
-        vecxd Cdf_dx    = vecxd::Zero(model_num_dofs);
-        vecxd Cdf_dy    = vecxd::Zero(model_num_dofs);
-        vecxd Cdf_dz    = vecxd::Zero(model_num_dofs);
 
         std::vector<entity_data_gpu> edgs;
 
+#ifdef USE_BLOCKED_GPU_KERNELS
+        std::vector<entity_data_cpu> eds;
+        size_t model_num_dofs_gpu = 0;
+#else
+        size_t model_num_dofs_gpu = model_num_dofs;
+#endif
         for (auto& e : mod)
         {
             e.project(f, Pf);
@@ -104,14 +107,27 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
             entity_data_cpu ed;
             e.populate_entity_data(ed);
             entity_data_gpu edg(ed);
+#ifdef USE_BLOCKED_GPU_KERNELS
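+            /* blocked layout: entity DoFs sit at padded, dblock-aligned
+             * offsets, so keep a separate GPU DoF count */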
+            edg.dof_base = model_num_dofs_gpu;
+            model_num_dofs_gpu += gpu_dblocks_dofs(ed);
+            eds.push_back( std::move(ed) );
+#endif
             edgs.push_back( std::move(edg) );
         }
 
          /* Prepare I/O vectors and call kernel */
+#ifdef USE_BLOCKED_GPU_KERNELS
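+        /* upload the projected field in the blocked layout */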
+        assert(eds.size() == edgs.size());
+        vecxd Pf_reshaped = vecxd::Zero(model_num_dofs_gpu);
+        for (size_t i = 0; i < eds.size(); i++)
+            reshape_dofs(eds[i], edgs[i], Pf, Pf_reshaped, true);
+        texture_allocator<double> Pf_gpu(Pf_reshaped.data(), Pf_reshaped.size());
+#else
         texture_allocator<double> Pf_gpu(Pf.data(), Pf.size());
-        device_vector<double> df_dx_gpu(model_num_dofs);
-        device_vector<double> df_dy_gpu(model_num_dofs);
-        device_vector<double> df_dz_gpu(model_num_dofs);
+#endif
+        device_vector<double> df_dx_gpu(model_num_dofs_gpu);
+        device_vector<double> df_dy_gpu(model_num_dofs_gpu);
+        device_vector<double> df_dz_gpu(model_num_dofs_gpu);
 
         for (auto& edg : edgs)
         {
@@ -136,9 +152,37 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
             }
         }
 
+#ifdef USE_BLOCKED_GPU_KERNELS
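+        /* copy the results out in blocked layout and reshape them back to the
+         * contiguous CPU ordering; Pf is round-tripped as well, which doubles
+         * as a sanity check on reshape_dofs() itself. */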
+        vecxd Cdf_dx_exp = vecxd::Zero(model_num_dofs_gpu);
+        vecxd Cdf_dy_exp = vecxd::Zero(model_num_dofs_gpu);
+        vecxd Cdf_dz_exp = vecxd::Zero(model_num_dofs_gpu);
+
+        df_dx_gpu.copyout( Cdf_dx_exp.data() );
+        df_dy_gpu.copyout( Cdf_dy_exp.data() );
+        df_dz_gpu.copyout( Cdf_dz_exp.data() );
+
+        vecxd Cdf_dx = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dy = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dz = vecxd::Zero(model_num_dofs);
+
+        assert(eds.size() == edgs.size());
+        Pf = vecxd::Zero(model_num_dofs);
+        for (size_t i = 0; i < eds.size(); i++)
+        {
+            reshape_dofs(eds[i], edgs[i], Cdf_dx_exp, Cdf_dx, false);
+            reshape_dofs(eds[i], edgs[i], Cdf_dy_exp, Cdf_dy, false);
+            reshape_dofs(eds[i], edgs[i], Cdf_dz_exp, Cdf_dz, false);
+            reshape_dofs(eds[i], edgs[i], Pf_reshaped, Pf, false);
+        }
+#else
+        vecxd Cdf_dx = vecxd::Zero(model_num_dofs_gpu);
+        vecxd Cdf_dy = vecxd::Zero(model_num_dofs_gpu);
+        vecxd Cdf_dz = vecxd::Zero(model_num_dofs_gpu);
+
         df_dx_gpu.copyout( Cdf_dx.data() );
         df_dy_gpu.copyout( Cdf_dy.data() );
         df_dz_gpu.copyout( Cdf_dz.data() );
+#endif
         
         double err_x = 0.0;
         double err_y = 0.0;
@@ -173,6 +217,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
                 auto gi = e.cell_global_index_by_gmsh(iT);
                 assert(gi < model_num_cells);
                 vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+                var_Pf[gi] = Pf.segment(ofs, num_bf).dot(phi_bar);
                 var_df_dx[gi] = Cdf_dx.segment(ofs, num_bf).dot(phi_bar);
                 var_df_dy[gi] = Cdf_dy.segment(ofs, num_bf).dot(phi_bar);
                 var_df_dz[gi] = Cdf_dz.segment(ofs, num_bf).dot(phi_bar);
@@ -183,6 +228,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         }
 
 #ifdef WRITE_TEST_OUTPUTS
+        silodb.write_zonal_variable("f", var_Pf);
         silodb.write_zonal_variable("df_dx", var_df_dx);
         silodb.write_zonal_variable("df_dy", var_df_dy);
         silodb.write_zonal_variable("df_dz", var_df_dz);
@@ -224,7 +270,7 @@ int main(void)
 
     std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
     for (size_t go = 1; go < 2; go++)
-        for (size_t ao = go; ao < 7; ao++)
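+        /* presumably ao < 5 because the blocked path only covers orders 1-4 */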
+        for (size_t ao = go; ao < 5; ao++)
             failed_tests += test_differentiation_convergence(go, ao);
 
     return failed_tests;
diff --git a/tests/test_lifting_gpu.cpp b/tests/test_lifting_gpu.cpp
index c1b795763901c070bb01a3114b066f787e3a338a..ee38029c153f0dc53fdf89915afe9086e6d879c9 100644
--- a/tests/test_lifting_gpu.cpp
+++ b/tests/test_lifting_gpu.cpp
@@ -252,7 +252,7 @@ int main(void)
     int failed_tests = 0;
 
     std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl;
-    for (size_t ao = 1; ao < 4; ao++)
+    for (size_t ao = 1; ao < 7; ao++)
             test_lifting(1, ao);
 
     gmsh::finalize();