diff --git a/CMakeLists.txt b/CMakeLists.txt
index 727b412268ccc4275bd94617ef8f6b232d219d68..a4e22120f4275435d52c5c5fe3a3cba5cff03041 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,7 +66,6 @@ link_directories(${GMSH_LIB})
 
 ######################################################################
 ## Use or not use GPU solvers
-
 option(OPT_ENABLE_GPU_SOLVER "Enable GPU solvers" OFF)
 if (OPT_ENABLE_GPU_SOLVER)
     add_definitions(-DENABLE_GPU_SOLVER)
@@ -117,9 +116,14 @@ if (OPT_ENABLE_GPU_SOLVER)
             add_definitions(-D__HIP_PLATFORM_HCC__)
         endif()
     endif()
-
 endif()
 
+if (OPT_ENABLE_GPU_SOLVER)
+    option(OPT_DEBUG_GPU "Enable GPU debugging" OFF)
+    if (OPT_DEBUG_GPU)
+        add_definitions(-DGPU_DEBUG)
+    endif()
+endif()
 
 ######################################################################
 ## Compiler identification stuff
@@ -235,12 +239,15 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 set(COMMON_SOURCES gmsh_io.cpp reference_element.cpp physical_element.cpp entity.cpp kernels_cpu.cpp connectivity.cpp entity_data.cpp)
 
 set(LIBGMSHDG_SOURCES src/gmsh_io.cpp
+                      src/entity.cpp
                       src/reference_element.cpp
                       src/physical_element.cpp
-                      src/entity.cpp
-                      src/kernels_cpu.cpp
                       src/connectivity.cpp
-                      src/entity_data.cpp)
+                      src/entity_data.cpp
+                      src/kernels_cpu.cpp)
+
+if (OPT_ENABLE_GPU_SOLVER)
+    list(APPEND LIBGMSHDG_SOURCES src/kernels_gpu_mi.cpp
+                                  src/kernels_cuda.cu)
+endif()
 
 add_library(gmshdg SHARED ${LIBGMSHDG_SOURCES})
 target_link_libraries(gmshdg ${LINK_LIBS})
diff --git a/include/eigen.h b/include/eigen.h
index 8475f3f459d0463d5bf592141b04d9ea67bccc1b..16d1dd5926953853b5663c473b065649b091b13f 100644
--- a/include/eigen.h
+++ b/include/eigen.h
@@ -13,4 +13,5 @@ template<typename T>
 using MatxCM = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>;
 
 template<typename T>
-using MatxRM = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
\ No newline at end of file
+using MatxRM = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
diff --git a/include/entity_data.h b/include/entity_data.h
index 01c968e463b7a383966f7d569c78361ef9c51384..43681cdb18c242b9abe193f36a83ca5321d774cf 100644
--- a/include/entity_data.h
+++ b/include/entity_data.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <cassert>
+#include <numeric>
+
 #include "eigen.h"
 #include "types.h"
 
@@ -57,13 +59,20 @@ struct entity_data_cpu
     vecxd               face_determinants;
 };
 
+inline size_t
+num_elems_all_orientations(const entity_data_cpu& ed)
+{
+    assert(ed.num_elems.size() == ed.num_orientations);
+    return std::accumulate(ed.num_elems.begin(), ed.num_elems.end(), size_t(0));
+}
+
 #ifdef ENABLE_GPU_SOLVER
 struct entity_data_gpu
 {
     int                 g_order;
     int                 a_order;
-    size_t*             num_elems;
-    size_t              num_orientations;
+    size_t              num_all_elems;
+    size_t*             element_orientation;
     size_t              num_bf;
 
     size_t              dof_base;
@@ -79,13 +88,12 @@ struct entity_data_gpu
     ~entity_data_gpu();
 };
 
-
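+/* Allocate a device buffer sized like src and copy src into it; *dst
+ * receives the device pointer, which the caller must eventually free. */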
 template<typename T>
 void
 gpu_copyin(const std::vector<T> src, T** dst)
 {
     checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
-    checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
+    checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
 }
 
 template<typename T>
@@ -93,7 +101,7 @@ void
 gpu_copyin(const MatxCM<T>& src, T** dst)
 {
     checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
-    checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
+    checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
 }
 
 template<typename T>
@@ -101,7 +109,7 @@ void
 gpu_copyin(const MatxRM<T>& src, T** dst)
 {
     checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
-    checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
+    checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
 }
 
 template<typename T>
@@ -109,7 +117,9 @@ void
 gpu_copyin(const Eigen::Matrix<T, Eigen::Dynamic, 1>& src, double** dst)
 {
     checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
-    checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
+    checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
 }
 
-#endif /* ENABLE_GPU_SOLVER */
\ No newline at end of file
+#endif /* ENABLE_GPU_SOLVER */
diff --git a/include/gpu_support.hpp b/include/gpu_support.hpp
index 26963c7878c5a9d60fcbcc2ad7b28900e2d717a9..758c2e28992759c04ecac92e22a394e4a9265176 100644
--- a/include/gpu_support.hpp
+++ b/include/gpu_support.hpp
@@ -168,10 +168,10 @@ public:
     device_vector(T *src, size_t size)
         : d_vptr(nullptr), vsize(0)
     {
-        init(src, size);
+        copyin(src, size);
     }
 
-    void init(T *src, size_t size)
+    void copyin(T *src, size_t size)
     {
         if (d_vptr != nullptr)
             (void) gpuFree(d_vptr);
@@ -181,6 +181,14 @@ public:
         vsize = size;
     }
 
+    void copyout(T *dst)
+    {
+        if (d_vptr == nullptr)
+            return;
+
+        (void)checkGPU( gpuMemcpy(dst, d_vptr, vsize*sizeof(T), gpuMemcpyDeviceToHost) );
+    }
+
     void resize(size_t size)
     {
         if (d_vptr != nullptr)
@@ -353,7 +361,7 @@ public:
         float elapsed;
         gpuEventSynchronize(stop);
         gpuEventElapsedTime(&elapsed, start, stop);
-        return elapsed/1000.0;
+        return double(elapsed)/1000.0;
     }
 };
 
diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index 1d6b79779534348a7eb68c958162eff36cbd25c2..1c45af64c9d13acad4d6a03797ca48fa8f900418 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -2,9 +2,12 @@
 
 #include "entity_data.h"
 
-template<size_t K>
-struct kernel_gpu_sizes;
-/* Compile-time constants configuring sizes for the kernels */
+/* Compile-time constants configuring sizes for the kernels.
+ * Elements are organized in dblocks of cells_per_dblock elements each.
+ * Each dblock occupies dblock_size dofs, where dblock_size must be a
+ * multiple of the warp size; the dofs beyond the dblock_bf effective
+ * ones are padding. */
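+/* For instance, with the order-2 sizes below: num_bf = 10 and
+ * cells_per_dblock = 12 give dblock_bf = 120 effective dofs, stored in
+ * a dblock_size = 128 slot (a multiple of the 32-thread warp size),
+ * i.e. 8 padding dofs per dblock. */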
+
 template<size_t K>
 struct kernel_gpu_sizes;
 
@@ -13,7 +16,7 @@ struct kernel_gpu_sizes<1>
 {
     static const size_t num_bf              = 4;
     static const size_t cells_per_dblock    = 32;
-    static const size_t dblock_bf           = num_bf * vols_per_ublock;
+    static const size_t dblock_bf           = num_bf * cells_per_dblock;
     static const size_t dblock_size         = 128;
     static const size_t parallel_dblocks    = 4;
 };
@@ -23,7 +26,17 @@ struct kernel_gpu_sizes<2>
 {
     static const size_t num_bf              = 10;
     static const size_t cells_per_dblock    = 12;
-    static const size_t dblock_bf           = num_bf * vols_per_ublock;
+    static const size_t dblock_bf           = num_bf * cells_per_dblock;
+    static const size_t dblock_size         = 128;
+    static const size_t parallel_dblocks    = 1;
+};
+
+template<>
+struct kernel_gpu_sizes<3>
+{
+    static const size_t num_bf              = 20;
+    static const size_t cells_per_dblock    = 12;
+    static const size_t dblock_bf           = num_bf * cells_per_dblock;
     static const size_t dblock_size         = 128;
     static const size_t parallel_dblocks    = 1;
 };
@@ -38,8 +51,7 @@ struct kernel_gpu_sizes_runtime
 };
 
 template<size_t N>
-kernel_gpu_sizes_runtime
-get_kernel_gpu_sizes()
+kernel_gpu_sizes_runtime get_kernel_gpu_sizes()
 {
     kernel_gpu_sizes_runtime ret;
 
@@ -47,13 +59,12 @@ get_kernel_gpu_sizes()
     ret.cells_per_dblock    = kernel_gpu_sizes<N>::cells_per_dblock;
     ret.dblock_bf           = kernel_gpu_sizes<N>::dblock_bf;
     ret.dblock_size         = kernel_gpu_sizes<N>::dblock_size;
-    ret.parallel_dblocks    = kernel_gpu_sizes<N>::slice_size;
+    ret.parallel_dblocks    = kernel_gpu_sizes<N>::parallel_dblocks;
 
     return ret;
 };
 
-inline
-kernel_gpu_sizes_runtime
+inline kernel_gpu_sizes_runtime
 get_kernel_gpu_sizes(int approx_order)
 {
     switch (approx_order)
@@ -65,49 +76,15 @@ get_kernel_gpu_sizes(int approx_order)
     throw std::invalid_argument("Can't retrieve info for this element");
 }
 
-/* Return the number of dofs that this entity will take on GPU,
- * including all the padding dofs */
-size_t gpu_dofs(const entity_data_cpu& ed)
-{
-    auto ks = get_kernel_gpu_sizes(ed.a_order);
-
-    size_t num_dblocks = tot_elems / ks.cells_per_dblock;
-    if (num_dblocks % ks.cells_per_dblock)
-        num_dblocks += 1;
-
-    return num_dblocks * ks.dblock_size;
-}
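+/* Number of dblocks needed to store an entity on the GPU, and total
+ * number of dofs (effective + padding) contained in those dblocks.
+ * Definitions in src/kernels_gpu_mi.cpp. */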
+size_t gpu_dblocks(const entity_data_cpu&);
+size_t gpu_dblocks_dofs(const entity_data_cpu&);
 
 #define TO_GPU      true
 #define FROM_GPU    false
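+/* Convert a dof vector between the linear CPU layout and the padded
+ * dblock GPU layout; the last parameter selects the direction. */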
+void reshape_dofs(const entity_data_cpu&, const entity_data_gpu&, const vecxd&, vecxd&, bool);
 
-void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
-                  const vecxd& in_field, vecxd& out_field, bool to_gpu)
-{
-    auto ks = get_kernel_gpu_sizes(ed.a_order);
 
-    size_t num_dblocks = tot_elems / ks.cells_per_dblock;
-    if (num_dblocks % ks.cells_per_dblock)
-        num_dblocks += 1;
-
-    for (size_t i = 0; i < num_dblocks; i++)
-    {
-        for (size_t j = 0; j < ks.cells_per_dblock; j++)
-        {
-            auto cur_elem =  ks.cells_per_dblock*i + j;
-            if (cur_elem >= tot_elems)
-                break;
-            
-            auto cpu_base = ed.dof_base + cur_elem * ed.num_bf;
-            auto gpu_base = edg.dof_base + ks.dblock_size*i + ks.num_bf*j;
-
-            if (to_gpu)
-                for (size_t k = 0; k < ks.num_bf; j++)
-                    out_field(gpu_base + k) = in_field(cpu_base + k);
-            else
-                for (size_t k = 0; k < ks.num_bf; j++)
-                    out_field(cpu_base + k) = in_field(gpu_base + k);
-        }
-    }
-}
 
+void
+gpu_compute_field_derivatives(entity_data_gpu& edg,
+    const double* F, double* dF_dx, double* dF_dy, double* dF_dz);
diff --git a/src/entity_data.cpp b/src/entity_data.cpp
index 910445e3b3014825c211d25b4391b5b64d579338..fc9c7edbe732085a57a6603d87560177f9245e31 100644
--- a/src/entity_data.cpp
+++ b/src/entity_data.cpp
@@ -2,7 +2,7 @@
 
 entity_data_gpu::entity_data_gpu()
 {
-    num_elems = nullptr;
+    element_orientation = nullptr;
     differentiation_matrices = nullptr;
     jacobians = nullptr;
     cell_determinants = nullptr;
@@ -13,8 +13,18 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
     g_order = ed.g_order;
     a_order = ed.a_order;
 
-    gpu_copyin(ed.num_elems, &num_elems);
-    num_orientations = ed.num_orientations;
+    num_all_elems = num_elems_all_orientations(ed);
+
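+    /* Build the per-element orientation table: element iT is assigned
+     * the index of the orientation group it belongs to (elements are
+     * assumed sorted by orientation, cf. entity::sort_by_orientation()). */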
+    std::vector<size_t> eo;
+    eo.reserve(num_all_elems);
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    {
+        for (size_t iT = 0; iT < ed.num_elems[iO]; iT++)
+            eo.push_back(iO);
+    }
+    assert(eo.size() == num_all_elems);
+    gpu_copyin(eo, &element_orientation);
+
     num_bf = ed.num_bf;
 
     dof_base = 0;
@@ -35,9 +45,10 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
 
 entity_data_gpu::~entity_data_gpu()
 {
-    if (num_elems)
-        checkGPU( gpuFree(num_elems) );
-    
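+    /* Freeing is disabled for now: entity_data_gpu objects are copied
+     * by value (e.g. when launching kernels) without deep-copy
+     * semantics, so a freeing destructor would presumably risk double
+     * frees. To be revisited. */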
+    /*
+    if (element_orientation)
+        checkGPU( gpuFree(element_orientation) );
+
     if (differentiation_matrices)
         checkGPU( gpuFree(differentiation_matrices) );
     
@@ -46,5 +57,6 @@ entity_data_gpu::~entity_data_gpu()
 
     if (cell_determinants)
         checkGPU( gpuFree(cell_determinants) );
+    */
 }
 
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0b131ae7c4450ee7b40c117667b2c04f7367a800
--- /dev/null
+++ b/src/kernels_cuda.cu
@@ -0,0 +1,125 @@
+#include "entity_data.h"
+#include "kernels_gpu.h"
+
+template<size_t K>
+__global__ void
+gpu_compute_field_derivatives_kernel_planar(entity_data_gpu edg,
+    const double* F, double* dF_dx, double* dF_dy, double* dF_dz)
+{
+    using KS = kernel_gpu_sizes<K>;
+
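+    /* One thread per dof: the global thread id is the offset of the
+     * dof within this entity. */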
+    int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
+
+    int32_t cur_dof_offset = edg.dof_base + ofs_in_entity;
+    if (cur_dof_offset >= edg.num_all_elems*edg.num_bf)
+        return;
+
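+    /* Recover the element index, its orientation and the row of the
+     * differentiation matrix this thread computes. */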
+    int32_t iT = cur_dof_offset / KS::num_bf;
+    int32_t elem_dof_base = edg.dof_base + iT*edg.num_bf;
+    int32_t iO = edg.element_orientation[iT];
+    int32_t DM_row = ofs_in_entity % KS::num_bf;
+    int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+    double *DM = edg.differentiation_matrices + DM_orient;
+
+    double accm_dF_dx = 0.0;
+    double accm_dF_dy = 0.0;
+    double accm_dF_dz = 0.0;
+
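+    /* Load the 3x3 Jacobian of element iT into registers. */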
+    int32_t jac_ofs = 9*iT;
+    double J[9];
+    for (int32_t i = 0; i < 9; i++)
+        J[i] = edg.jacobians[jac_ofs+i];
+
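+    /* Chain rule: pass k applies the k-th reference-direction
+     * differentiation matrix and weights the result with column k of
+     * the Jacobian. */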
+    for (int32_t dof = 0; dof < KS::num_bf; dof++)
+    {
+        int32_t d_ofs = DM_row + KS::num_bf*dof;
+        int32_t f_ofs = elem_dof_base + dof;
+        double v = DM[d_ofs] * F[f_ofs];
+        accm_dF_dx += J[0] * v;
+        accm_dF_dy += J[3] * v;
+        accm_dF_dz += J[6] * v;
+    }
+
+    for (int32_t dof = 0; dof < KS::num_bf; dof++)
+    {
+        int32_t d_ofs = DM_row + KS::num_bf*KS::num_bf + KS::num_bf*dof;
+        int32_t f_ofs = elem_dof_base + dof;
+        double v = DM[d_ofs] * F[f_ofs];
+        accm_dF_dx += J[1] * v;
+        accm_dF_dy += J[4] * v;
+        accm_dF_dz += J[7] * v;
+    }
+
+    for (int32_t dof = 0; dof < KS::num_bf; dof++)
+    {
+        int32_t d_ofs = DM_row + 2*KS::num_bf*KS::num_bf + KS::num_bf*dof;
+        int32_t f_ofs = elem_dof_base + dof;
+        double v = DM[d_ofs] * F[f_ofs];
+        accm_dF_dx += J[2] * v;
+        accm_dF_dy += J[5] * v;
+        accm_dF_dz += J[8] * v;
+    }
+
+    dF_dx[cur_dof_offset] = accm_dF_dx;
+    dF_dy[cur_dof_offset] = accm_dF_dy;
+    dF_dz[cur_dof_offset] = accm_dF_dz;
+}
+
+void
+gpu_compute_field_derivatives(entity_data_gpu& edg,
+    const double* f, double* df_dx, double* df_dy, double* df_dz)
+{
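+    /* One thread per dof; round the grid size up to cover all dofs. */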
+    const auto THREADS_PER_BLOCK = 256;
+    auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
+    if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
+        num_blocks += 1;
+
+    switch (edg.a_order)
+    {
+        case 1:
+            if (edg.g_order == 1)
+                gpu_compute_field_derivatives_kernel_planar<1><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
+            //else
+            //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
+            break;
+
+        case 2:
+            if (edg.g_order == 1)
+                gpu_compute_field_derivatives_kernel_planar<2><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
+            //else
+            //    compute_field_derivatives_kernel_curved<2>(ed, f, df_dx, df_dy, df_dz);
+            break;
+
+        case 3:
+            if (edg.g_order == 1)
+                gpu_compute_field_derivatives_kernel_planar<3><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
+            //else
+            //    compute_field_derivatives_kernel_curved<3>(ed, f, df_dx, df_dy, df_dz);
+            break;
+#if 0
+        case 4:
+            if (edg.g_order == 1)
+                gpu_compute_field_derivatives_kernel_planar<4>(edg, f, df_dx, df_dy, df_dz);
+            //else
+            //    compute_field_derivatives_kernel_curved<4>(ed, f, df_dx, df_dy, df_dz);
+            break;
+
+        case 5:
+            if (edg.g_order == 1)
+                gpu_compute_field_derivatives_kernel_planar<5>(edg, f, df_dx, df_dy, df_dz);
+            //else
+            //    compute_field_derivatives_kernel_curved<5>(ed, f, df_dx, df_dy, df_dz);
+            break;
+
+        case 6:
+            if (edg.g_order == 1)
+                gpu_compute_field_derivatives_kernel_planar<6>(edg, f, df_dx, df_dy, df_dz);
+            //else
+            //    compute_field_derivatives_kernel_curved<6>(ed, f, df_dx, df_dy, df_dz);
+            break;
+#endif
+
+        default:
+            std::cerr << "gpu_compute_field_derivatives: invalid order" << std::endl;
+    }
+}
diff --git a/src/kernels_gpu_mi.cpp b/src/kernels_gpu_mi.cpp
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..76587a505425ec53cd5d0fbbb0fed2f65bbd2728 100644
--- a/src/kernels_gpu_mi.cpp
+++ b/src/kernels_gpu_mi.cpp
@@ -0,0 +1,54 @@
+#include "kernels_gpu.h"
+
+/* Return how many dblocks are needed to store this entity on GPU. */
+size_t gpu_dblocks(const entity_data_cpu& ed)
+{
+    auto ks = get_kernel_gpu_sizes(ed.a_order);
+
+    size_t tot_elems = num_elems_all_orientations(ed);
+    size_t num_dblocks = tot_elems / ks.cells_per_dblock;
+    if (tot_elems % ks.cells_per_dblock)
+        num_dblocks += 1;
+
+    return num_dblocks;
+}
+
+/* Return the total number of dofs (actual + padding) that are contained
+ * in the dblocks making up this entity. */
+size_t gpu_dblocks_dofs(const entity_data_cpu& ed)
+{
+    auto ks = get_kernel_gpu_sizes(ed.a_order);
+    size_t num_dblocks = gpu_dblocks(ed);
+    return num_dblocks * ks.dblock_size;
+}
+
+void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
+                  const vecxd& in_field, vecxd& out_field, bool to_gpu)
+{
+    auto ks = get_kernel_gpu_sizes(ed.a_order);
+
+    size_t tot_elems = num_elems_all_orientations(ed);
+    size_t num_dblocks = gpu_dblocks(ed);
+
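+    /* Within each dblock, copy the num_bf real dofs of every cell; the
+     * padding that rounds the dblock up to dblock_size is left as is. */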
+    for (size_t i = 0; i < num_dblocks; i++)
+    {
+        for (size_t j = 0; j < ks.cells_per_dblock; j++)
+        {
+            auto cur_elem =  ks.cells_per_dblock*i + j;
+            if (cur_elem >= tot_elems)
+                break;
+
+            auto cpu_base = ed.dof_base + cur_elem * ed.num_bf;
+            auto gpu_base = edg.dof_base + ks.dblock_size*i + ks.num_bf*j;
+
+            if (to_gpu)
+                for (size_t k = 0; k < ks.num_bf; k++)
+                    out_field(gpu_base + k) = in_field(cpu_base + k);
+            else
+                for (size_t k = 0; k < ks.num_bf; k++)
+                    out_field(cpu_base + k) = in_field(gpu_base + k);
+        }
+    }
+}
diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp
index c4e65650307234c019ec9412d4822b50fa57aa03..ac01742014f617149c02eee0f5a5d9f20b5d7f9a 100644
--- a/tests/test_differentiation_gpu.cpp
+++ b/tests/test_differentiation_gpu.cpp
@@ -10,6 +10,8 @@
 #include "kernels_cpu.h"
 #include "timecounter.h"
 
+#include "kernels_gpu.h"
+
 using namespace sgr;
 
 static void
@@ -58,22 +60,34 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
 
         auto& e0 = mod.entity_at(0);
         e0.sort_by_orientation();
-        vecxd Pf_e0 = e0.project(f);
-        vecxd df_dx_e0 = vecxd::Zero( Pf_e0.rows() );
-        vecxd df_dy_e0 = vecxd::Zero( Pf_e0.rows() );
-        vecxd df_dz_e0 = vecxd::Zero( Pf_e0.rows() );
+        vecxd Pf_cpu = e0.project(f);
 
+        /* Make CPU entity data */
         entity_data_cpu ed;
         e0.populate_entity_data(ed);
-
+        /* Derive GPU entity data */
         entity_data_gpu edg(ed);
 
-#if 0
-        timecounter tc;
+        /* Prepare I/O vectors and call kernel */
+        device_vector<double> Pf_gpu(Pf_cpu.data(), Pf_cpu.size());
+        device_vector<double> df_dx_gpu(Pf_cpu.size());
+        device_vector<double> df_dy_gpu(Pf_cpu.size());
+        device_vector<double> df_dz_gpu(Pf_cpu.size());
+
+        timecounter_gpu tc;
         tc.tic();
-        compute_field_derivatives(ed, Pf_e0, df_dx_e0, df_dy_e0, df_dz_e0);
+        gpu_compute_field_derivatives(edg, Pf_gpu.data(), df_dx_gpu.data(),
+            df_dy_gpu.data(), df_dz_gpu.data());
         double time = tc.toc();
 
+        vecxd df_dx_cpu = vecxd::Zero( Pf_cpu.size() );
+        vecxd df_dy_cpu = vecxd::Zero( Pf_cpu.size() );
+        vecxd df_dz_cpu = vecxd::Zero( Pf_cpu.size() );
+
+        df_dx_gpu.copyout( df_dx_cpu.data() );
+        df_dy_gpu.copyout( df_dy_cpu.data() );
+        df_dz_gpu.copyout( df_dz_cpu.data() );
+
         if (geometric_order == 1)
         {
             std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
@@ -101,9 +115,9 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
             auto& re = e0.cell_refelem(pe);
             matxd mass = e0.mass_matrix(iT);
             auto num_bf = re.num_basis_functions();
-            vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_e0.segment(iT*num_bf, num_bf); 
-            vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_e0.segment(iT*num_bf, num_bf); 
-            vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_e0.segment(iT*num_bf, num_bf); 
+            vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_cpu.segment(iT*num_bf, num_bf); 
+            vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_cpu.segment(iT*num_bf, num_bf); 
+            vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_cpu.segment(iT*num_bf, num_bf); 
         
             err_x += diff_x.dot(mass*diff_x);
             err_y += diff_y.dot(mass*diff_y);
@@ -115,14 +129,12 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         errs_x.push_back( std::sqrt(err_x) );
         errs_y.push_back( std::sqrt(err_y) );
         errs_z.push_back( std::sqrt(err_z) );
-#endif
     }
 
     double rate_x = 0.0;
     double rate_y = 0.0;
     double rate_z = 0.0;
 
-#if 0
     std::cout << Byellowfg << "rate df/dx  rate df/dy  rate df/dz" << reset << std::endl;
     for (size_t i = 1; i < sizes.size(); i++)
     {
@@ -134,7 +146,6 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
     COMPARE_VALUES_ABSOLUTE("df/dx", rate_x, double(approximation_order), 0.2);
     COMPARE_VALUES_ABSOLUTE("df/dy", rate_y, double(approximation_order), 0.2);
     COMPARE_VALUES_ABSOLUTE("df/dz", rate_z, double(approximation_order), 0.2);
-#endif
 
     return 0;
 }
@@ -150,8 +161,8 @@ int main(void)
     int failed_tests = 0;
 
     std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
-    for (size_t go = 1; go < 5; go++)
-        for (size_t ao = go; ao < 5; ao++)
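+    /* Restricted for now: only geometric order 1 (planar elements) with
+     * approximation orders up to 3 have GPU kernels (src/kernels_cuda.cu). */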
+    for (size_t go = 1; go < 2; go++)
+        for (size_t ao = go; ao < 4; ao++)
             failed_tests += test_differentiation_convergence(go, ao);
 
     return failed_tests;