diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..83f93b54b75bc9c0448eec2acc5bc1ac97f3e56f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,16 @@
+CMakeLists.txt.user
+CMakeCache.txt
+CMakeFiles
+CMakeScripts
+Testing
+Makefile
+cmake_install.cmake
+install_manifest.txt
+compile_commands.json
+CTestTestfile.cmake
+_deps
+*.out
+
+build
+share/**/*.silo
+.vscode
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4f7df4acd36e3f181435d255232d5886b7bdde42..49827e51bfaeda58908f547245aba8634cfd5175 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -281,6 +281,12 @@ option(OPT_EXPENSIVE_ASSERTS "Enable expensive assert()s" OFF)
 if (OPT_EXPENSIVE_ASSERTS)
     add_definitions(-DEXPENSIVE_ASSERTS)
 endif()
+
+option(NEW_CURLS "Use the new method for computing curls" OFF)
+if (NEW_CURLS)
+    add_compile_definitions(NEW_CURLS)
+endif()
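+# Usage sketch (assumed invocation from a build directory; adjust the remaining
+# options to your setup):
+#   cmake -DNEW_CURLS=ON -DOPT_BUILD_TESTS=ON -DOPT_ENABLE_GPU_SOLVER=ON ..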
+
 ######################################################################
 ## Enable/disable solver modules
 option(OPT_DEBUG_PRINT "Print debug information")
@@ -317,6 +323,9 @@ if (OPT_BUILD_TESTS)
     add_executable(test_mass tests/test_mass.cpp)
     target_link_libraries(test_mass gmshdg ${LINK_LIBS})
 
+    add_executable(test_curl tests/test_curl.cpp)
+    target_link_libraries(test_curl gmshdg ${LINK_LIBS})
+
     add_executable(test_differentiation tests/test_differentiation.cpp)
     target_link_libraries(test_differentiation gmshdg ${LINK_LIBS})
 
@@ -324,12 +333,18 @@ if (OPT_BUILD_TESTS)
     target_link_libraries(test_lifting gmshdg ${LINK_LIBS})
 
     if (OPT_ENABLE_GPU_SOLVER)
+        add_executable(test_curl_gpu tests/test_curl_gpu.cpp)
+        target_link_libraries(test_curl_gpu gmshdg ${LINK_LIBS})
+
         add_executable(test_differentiation_gpu tests/test_differentiation_gpu.cpp)
         target_link_libraries(test_differentiation_gpu gmshdg ${LINK_LIBS})
 
         add_executable(profile_differentiation_gpu tests/profile_differentiation_gpu.cpp)
         target_link_libraries(profile_differentiation_gpu gmshdg ${LINK_LIBS})
 
+        add_executable(profile_curl_gpu tests/profile_curl_gpu.cpp)
+        target_link_libraries(profile_curl_gpu gmshdg ${LINK_LIBS})
+
         add_executable(test_lifting_gpu tests/test_lifting_gpu.cpp)
         target_link_libraries(test_lifting_gpu gmshdg ${LINK_LIBS})
     endif()
diff --git a/include/libgmshdg/kernels_cpu.h b/include/libgmshdg/kernels_cpu.h
index fe14d5572715e7643a0a137c8b6e78557c88db2f..813418ce1dbdcbe80a70fd045d9f99abb898d516 100644
--- a/include/libgmshdg/kernels_cpu.h
+++ b/include/libgmshdg/kernels_cpu.h
@@ -19,6 +19,450 @@ struct kernel_cpu_sizes
     static const size_t num_all_fluxes = num_fluxes * num_faces;
 };
 
+template<size_t AK>
+void compute_curl_kernel_planar(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha,
+    vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz)
+{
+    /* Flop count: (45*ed.num_bf + 6)*ed.num_bf*num_total_elems */
+    /* The elements must be ordered by orientation -> entity::sort_by_orientation() */
+
+    assert(ed.g_order == 1);
+
+    using KS = kernel_cpu_sizes<AK>;
+    assert(ed.num_bf == KS::num_bf);
+
+    size_t orient_elem_base = 0;
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    {
+        size_t num_elems = ed.num_elems[iO];
+        size_t orient_base = orient_elem_base * ed.num_bf;
+
+        #pragma omp parallel for //num_threads(2)
+        for (size_t iT = 0; iT < num_elems; iT++)
+        {
+            size_t ent_iT = orient_elem_base + iT;
+            size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf;
+            
+            for (size_t odof = 0; odof < KS::num_bf; odof++)
+            {
+                double acc_dfx_dy = 0.0;
+                double acc_dfx_dz = 0.0;
+
+                double acc_dfy_dx = 0.0;
+                double acc_dfy_dz = 0.0;
+
+                double acc_dfz_dx = 0.0;
+                double acc_dfz_dy = 0.0;
+
+                for (size_t idof = 0; idof < KS::num_bf; idof++)
+                {
+                    auto dm_row = iO*KS::num_bf + odof;
+                    auto dm_col0 = 0*KS::num_bf + idof;
+                    auto dm_col1 = 1*KS::num_bf + idof;
+                    auto dm_col2 = 2*KS::num_bf + idof;
+
+                    auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0);
+                    auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1);
+                    auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2);
+                    
+                    auto jbase = 3*ent_iT;
+
+                    double Fx = fx(dofofs+idof);
+                    double v0_fx = dm_0 * Fx;
+                    double v1_fx = dm_1 * Fx;
+                    double v2_fx = dm_2 * Fx;
+                    acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx 
+                        + ed.jacobians(jbase + 1, 1) * v1_fx 
+                        + ed.jacobians(jbase + 1, 2) * v2_fx);
+                    acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx
+                        + ed.jacobians(jbase + 2, 1) * v1_fx
+                        + ed.jacobians(jbase + 2, 2) * v2_fx);
+                    
+                    double Fy = fy(dofofs+idof);
+                    double v0_fy = dm_0 * Fy;
+                    double v1_fy = dm_1 * Fy;
+                    double v2_fy = dm_2 * Fy;
+                    acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy
+                        + ed.jacobians(jbase + 0, 1) * v1_fy
+                        + ed.jacobians(jbase + 0, 2) * v2_fy);
+                    acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy
+                        + ed.jacobians(jbase + 2, 1) * v1_fy
+                        + ed.jacobians(jbase + 2, 2) * v2_fy);
+                    
+                    double Fz = fz(dofofs+idof);
+                    double v0_fz = dm_0 * Fz;
+                    double v1_fz = dm_1 * Fz;
+                    double v2_fz = dm_2 * Fz;
+                    acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz
+                        + ed.jacobians(jbase + 0, 1) * v1_fz
+                        + ed.jacobians(jbase + 0, 2) * v2_fz);
+                    acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz
+                        + ed.jacobians(jbase + 1, 1) * v1_fz
+                        + ed.jacobians(jbase + 1, 2) * v2_fz);
+                    
+                }
+
+                curl_fx(dofofs+odof) = alpha * (acc_dfz_dy - acc_dfy_dz);
+                curl_fy(dofofs+odof) = alpha * (acc_dfx_dz - acc_dfz_dx);
+                curl_fz(dofofs+odof) = alpha * (acc_dfy_dx - acc_dfx_dy);
+
+            }
+        }
+
+        orient_elem_base += num_elems;
+    }
+}
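+
+/* Worked restatement of the planar kernel above (no additional functionality):
+ * the reference-space derivatives dm_0/dm_1/dm_2 of each basis function are
+ * contracted with the rows of ed.jacobians (constant per element when
+ * g_order == 1, hence the single jbase = 3*ent_iT block) to obtain physical
+ * derivatives, and the outputs are
+ *
+ *   curl_fx = alpha * (dfz/dy - dfy/dz)
+ *   curl_fy = alpha * (dfx/dz - dfz/dx)
+ *   curl_fz = alpha * (dfy/dx - dfx/dy)
+ *
+ * i.e. alpha * curl(f), evaluated dof by dof. */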
+
+template<size_t AK>
+void compute_curl_kernel_curved(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha,
+    vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz)
+{
+    /* Flop count: ((45*ed.num_bf+12)*ed.num_bf*ed.num_qp + 6*((2*ed.num_bf-1) + 1)*ed.num_bf)*num_total_elems */
+    /* The elements must be ordered by orientation -> entity::sort_by_orientation() */
+
+    assert(ed.g_order == 1);
+
+    using KS = kernel_cpu_sizes<AK>;
+    assert(ed.num_bf == KS::num_bf);
+
+    size_t orient_elem_base = 0;
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    {
+        size_t num_elems = ed.num_elems[iO];
+        size_t orient_base = orient_elem_base * ed.num_bf;
+
+        #pragma omp parallel for //num_threads(2)
+        for (size_t iT = 0; iT < num_elems; iT++)
+        {
+            size_t ent_iT = orient_elem_base + iT;
+            size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf;
+
+            vecxd dfx_dy = vecxd::Zero(ed.num_bf);
+            vecxd dfx_dz = vecxd::Zero(ed.num_bf);
+
+            vecxd dfy_dx = vecxd::Zero(ed.num_bf);
+            vecxd dfy_dz = vecxd::Zero(ed.num_bf);
+
+            vecxd dfz_dx = vecxd::Zero(ed.num_bf);
+            vecxd dfz_dy = vecxd::Zero(ed.num_bf);
+
+            for (size_t iQp = 0; iQp < ed.num_qp; iQp++)
+            {
+                auto det = ed.cell_determinants(ent_iT * ed.num_qp + iQp);
+
+                for (size_t odof = 0; odof < KS::num_bf; odof++)
+                {
+                    double acc_dfx_dy = 0.0;
+                    double acc_dfx_dz = 0.0;
+
+                    double acc_dfy_dx = 0.0;
+                    double acc_dfy_dz = 0.0;
+
+                    double acc_dfz_dx = 0.0;
+                    double acc_dfz_dy = 0.0;
+
+                    for (size_t idof = 0; idof < KS::num_bf; idof++)
+                    {
+                        auto dm_row = iO * KS::num_bf * ed.num_qp + KS::num_bf * iQp + odof;
+                        auto dm_col0 = 0 * KS::num_bf + idof;
+                        auto dm_col1 = 1 * KS::num_bf + idof;
+                        auto dm_col2 = 2 * KS::num_bf + idof;
+
+                        auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0);
+                        auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1);
+                        auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2);
+
+                        auto jbase = 3 * ent_iT * ed.num_qp + 3 * iQp;
+
+                        double Fx = fx(dofofs+idof);
+                        double v0_fx = dm_0 * Fx;
+                        double v1_fx = dm_1 * Fx;
+                        double v2_fx = dm_2 * Fx;
+                        acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx 
+                            + ed.jacobians(jbase + 1, 1) * v1_fx 
+                            + ed.jacobians(jbase + 1, 2) * v2_fx);
+                        acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx
+                            + ed.jacobians(jbase + 2, 1) * v1_fx
+                            + ed.jacobians(jbase + 2, 2) * v2_fx);
+                        
+                        double Fy = fy(dofofs+idof);
+                        double v0_fy = dm_0 * Fy;
+                        double v1_fy = dm_1 * Fy;
+                        double v2_fy = dm_2 * Fy;
+                        acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy
+                            + ed.jacobians(jbase + 0, 1) * v1_fy
+                            + ed.jacobians(jbase + 0, 2) * v2_fy);
+                        acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy
+                            + ed.jacobians(jbase + 2, 1) * v1_fy
+                            + ed.jacobians(jbase + 2, 2) * v2_fy);
+                        
+                        double Fz = fz(dofofs+idof);
+                        double v0_fz = dm_0 * Fz;
+                        double v1_fz = dm_1 * Fz;
+                        double v2_fz = dm_2 * Fz;
+                        acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz
+                            + ed.jacobians(jbase + 0, 1) * v1_fz
+                            + ed.jacobians(jbase + 0, 2) * v2_fz);
+                        acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz
+                            + ed.jacobians(jbase + 1, 1) * v1_fz
+                            + ed.jacobians(jbase + 1, 2) * v2_fz);
+                    }
+
+                    dfx_dy(odof) += det * acc_dfx_dy; 
+                    dfx_dz(odof) += det * acc_dfx_dz;
+
+                    dfy_dx(odof) += det * acc_dfy_dx;
+                    dfy_dz(odof) += det * acc_dfy_dz;
+
+                    dfz_dx(odof) += det * acc_dfz_dx;
+                    dfz_dy(odof) += det * acc_dfz_dy;
+
+                }
+            }
+
+            dfx_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dy;
+            dfx_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dz;
+
+            dfy_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dx;
+            dfy_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dz;
+            
+            dfz_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dx;
+            dfz_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dy;
+
+            curl_fx.segment(dofofs, ed.num_bf) = alpha * (dfz_dy - dfy_dz);
+            curl_fy.segment(dofofs, ed.num_bf) = alpha * (dfx_dz - dfz_dx);
+            curl_fz.segment(dofofs, ed.num_bf) = alpha * (dfy_dx - dfx_dy);
+        }
+
+        orient_elem_base += num_elems;
+    }
+}
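+
+/* Curved-element variant above: since the Jacobian varies inside the element, the
+ * weak derivatives are assembled at the quadrature points and scaled by
+ * ed.cell_determinants (assumed here to already include the quadrature weights),
+ * roughly
+ *
+ *   dfz_dy(i) = sum_q det_q * sum_j J_q(y-row) . (dm_0, dm_1, dm_2)_ij * fz(j)
+ *
+ * and the accumulated vectors are multiplied by the per-element inverse mass
+ * matrix before forming alpha * curl(f) exactly as in the planar kernel. */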
+
+template<size_t AK>
+void compute_curl_kernel_planar(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha,
+    vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz,
+    const vecxd& jx, const vecxd& jy, const vecxd& jz)
+{
+    /* Flop count: (45*ed.num_bf + 9)*ed.num_bf*num_total_elems */
+    /* The elements must be ordered by orientation -> entity::sort_by_orientation() */
+
+    assert(ed.g_order == 1);
+
+    using KS = kernel_cpu_sizes<AK>;
+    assert(ed.num_bf == KS::num_bf);
+
+    size_t orient_elem_base = 0;
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    {
+        size_t num_elems = ed.num_elems[iO];
+        size_t orient_base = orient_elem_base * ed.num_bf;
+
+        #pragma omp parallel for //num_threads(2)
+        for (size_t iT = 0; iT < num_elems; iT++)
+        {
+            size_t ent_iT = orient_elem_base + iT;
+            size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf;
+            
+            for (size_t odof = 0; odof < KS::num_bf; odof++)
+            {
+                double acc_dfx_dy = 0.0;
+                double acc_dfx_dz = 0.0;
+
+                double acc_dfy_dx = 0.0;
+                double acc_dfy_dz = 0.0;
+
+                double acc_dfz_dx = 0.0;
+                double acc_dfz_dy = 0.0;
+
+                for (size_t idof = 0; idof < KS::num_bf; idof++)
+                {
+                    auto dm_row = iO*KS::num_bf + odof;
+                    auto dm_col0 = 0*KS::num_bf + idof;
+                    auto dm_col1 = 1*KS::num_bf + idof;
+                    auto dm_col2 = 2*KS::num_bf + idof;
+
+                    auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0);
+                    auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1);
+                    auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2);
+                    
+                    auto jbase = 3*ent_iT;
+
+                    double Fx = fx(dofofs+idof);
+                    double v0_fx = dm_0 * Fx;
+                    double v1_fx = dm_1 * Fx;
+                    double v2_fx = dm_2 * Fx;
+                    acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx 
+                        + ed.jacobians(jbase + 1, 1) * v1_fx 
+                        + ed.jacobians(jbase + 1, 2) * v2_fx);
+                    acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx
+                        + ed.jacobians(jbase + 2, 1) * v1_fx
+                        + ed.jacobians(jbase + 2, 2) * v2_fx);
+                    
+                    double Fy = fy(dofofs+idof);
+                    double v0_fy = dm_0 * Fy;
+                    double v1_fy = dm_1 * Fy;
+                    double v2_fy = dm_2 * Fy;
+                    acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy
+                        + ed.jacobians(jbase + 0, 1) * v1_fy
+                        + ed.jacobians(jbase + 0, 2) * v2_fy);
+                    acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy
+                        + ed.jacobians(jbase + 2, 1) * v1_fy
+                        + ed.jacobians(jbase + 2, 2) * v2_fy);
+                    
+                    double Fz = fz(dofofs+idof);
+                    double v0_fz = dm_0 * Fz;
+                    double v1_fz = dm_1 * Fz;
+                    double v2_fz = dm_2 * Fz;
+                    acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz
+                        + ed.jacobians(jbase + 0, 1) * v1_fz
+                        + ed.jacobians(jbase + 0, 2) * v2_fz);
+                    acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz
+                        + ed.jacobians(jbase + 1, 1) * v1_fz
+                        + ed.jacobians(jbase + 1, 2) * v2_fz);
+                    
+                }
+
+                curl_fx(dofofs+odof) = alpha * (acc_dfz_dy - acc_dfy_dz) - jx(dofofs+odof);
+                curl_fy(dofofs+odof) = alpha * (acc_dfx_dz - acc_dfz_dx) - jy(dofofs+odof);
+                curl_fz(dofofs+odof) = alpha * (acc_dfy_dx - acc_dfx_dy) - jz(dofofs+odof);
+
+            }
+        }
+
+        orient_elem_base += num_elems;
+    }
+}
+
+template<size_t AK>
+void compute_curl_kernel_curved(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, 
+    vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz,
+    const vecxd& jx, const vecxd& jy, const vecxd& jz)
+{
+    /* Flop count: ((45*ed.num_bf+12)*ed.num_bf*ed.num_qp + (6*(2*ed.num_bf-1) + 9)*ed.num_bf)*num_total_elems */
+    /* The elements must be ordered by orientation -> entity::sort_by_orientation() */
+
+    assert(ed.g_order == 1);
+
+    using KS = kernel_cpu_sizes<AK>;
+    assert(ed.num_bf == KS::num_bf);
+
+    size_t orient_elem_base = 0;
+    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    {
+        size_t num_elems = ed.num_elems[iO];
+        size_t orient_base = orient_elem_base * ed.num_bf;
+
+        #pragma omp parallel for //num_threads(2)
+        for (size_t iT = 0; iT < num_elems; iT++)
+        {
+            size_t ent_iT = orient_elem_base + iT;
+            size_t dofofs = ed.dof_base + orient_base + iT*ed.num_bf;
+
+            vecxd dfx_dy = vecxd::Zero(ed.num_bf);
+            vecxd dfx_dz = vecxd::Zero(ed.num_bf);
+
+            vecxd dfy_dx = vecxd::Zero(ed.num_bf);
+            vecxd dfy_dz = vecxd::Zero(ed.num_bf);
+
+            vecxd dfz_dx = vecxd::Zero(ed.num_bf);
+            vecxd dfz_dy = vecxd::Zero(ed.num_bf);
+
+            for (size_t iQp = 0; iQp < ed.num_qp; iQp++)
+            {
+                auto det = ed.cell_determinants(ent_iT * ed.num_qp + iQp);
+
+                for (size_t odof = 0; odof < KS::num_bf; odof++)
+                {
+                    double acc_dfx_dy = 0.0;
+                    double acc_dfx_dz = 0.0;
+
+                    double acc_dfy_dx = 0.0;
+                    double acc_dfy_dz = 0.0;
+
+                    double acc_dfz_dx = 0.0;
+                    double acc_dfz_dy = 0.0;
+
+                    for (size_t idof = 0; idof < KS::num_bf; idof++)
+                    {
+                        auto dm_row = iO * KS::num_bf * ed.num_qp + KS::num_bf * iQp + odof;
+                        auto dm_col0 = 0 * KS::num_bf + idof;
+                        auto dm_col1 = 1 * KS::num_bf + idof;
+                        auto dm_col2 = 2 * KS::num_bf + idof;
+
+                        auto dm_0 = ed.differentiation_matrices(dm_row, dm_col0);
+                        auto dm_1 = ed.differentiation_matrices(dm_row, dm_col1);
+                        auto dm_2 = ed.differentiation_matrices(dm_row, dm_col2);
+
+                        auto jbase = 3 * ent_iT * ed.num_qp + 3 * iQp;
+
+                        double Fx = fx(dofofs+idof);
+                        double v0_fx = dm_0 * Fx;
+                        double v1_fx = dm_1 * Fx;
+                        double v2_fx = dm_2 * Fx;
+                        acc_dfx_dy += (ed.jacobians(jbase + 1, 0) * v0_fx 
+                            + ed.jacobians(jbase + 1, 1) * v1_fx 
+                            + ed.jacobians(jbase + 1, 2) * v2_fx);
+                        acc_dfx_dz += (ed.jacobians(jbase + 2, 0) * v0_fx
+                            + ed.jacobians(jbase + 2, 1) * v1_fx
+                            + ed.jacobians(jbase + 2, 2) * v2_fx);
+                        
+                        double Fy = fy(dofofs+idof);
+                        double v0_fy = dm_0 * Fy;
+                        double v1_fy = dm_1 * Fy;
+                        double v2_fy = dm_2 * Fy;
+                        acc_dfy_dx += (ed.jacobians(jbase + 0, 0) * v0_fy
+                            + ed.jacobians(jbase + 0, 1) * v1_fy
+                            + ed.jacobians(jbase + 0, 2) * v2_fy);
+                        acc_dfy_dz += (ed.jacobians(jbase + 2, 0) * v0_fy
+                            + ed.jacobians(jbase + 2, 1) * v1_fy
+                            + ed.jacobians(jbase + 2, 2) * v2_fy);
+                        
+                        double Fz = fz(dofofs+idof);
+                        double v0_fz = dm_0 * Fz;
+                        double v1_fz = dm_1 * Fz;
+                        double v2_fz = dm_2 * Fz;
+                        acc_dfz_dx += (ed.jacobians(jbase + 0, 0) * v0_fz
+                            + ed.jacobians(jbase + 0, 1) * v1_fz
+                            + ed.jacobians(jbase + 0, 2) * v2_fz);
+                        acc_dfz_dy += (ed.jacobians(jbase + 1, 0) * v0_fz
+                            + ed.jacobians(jbase + 1, 1) * v1_fz
+                            + ed.jacobians(jbase + 1, 2) * v2_fz);
+                    }
+
+                    dfx_dy(odof) += det * acc_dfx_dy; 
+                    dfx_dz(odof) += det * acc_dfx_dz;
+
+                    dfy_dx(odof) += det * acc_dfy_dx;
+                    dfy_dz(odof) += det * acc_dfy_dz;
+
+                    dfz_dx(odof) += det * acc_dfz_dx;
+                    dfz_dy(odof) += det * acc_dfz_dy;
+
+                }
+            }
+
+            dfx_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dy;
+            dfx_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfx_dz;
+
+            dfy_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dx;
+            dfy_dz = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfy_dz;
+            
+            dfz_dx = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dx;
+            dfz_dy = ed.invmass_matrices.block(ent_iT*ed.num_bf, 0, ed.num_bf, ed.num_bf) * dfz_dy;
+
+            curl_fx.segment(dofofs, ed.num_bf) = alpha * (dfz_dy - dfy_dz) - jx.segment(dofofs, ed.num_bf);
+            curl_fy.segment(dofofs, ed.num_bf) = alpha * (dfx_dz - dfz_dx) - jy.segment(dofofs, ed.num_bf);
+            curl_fz.segment(dofofs, ed.num_bf) = alpha * (dfy_dx - dfx_dy) - jz.segment(dofofs, ed.num_bf);
+        }
+
+        orient_elem_base += num_elems;
+    }
+}
+
 template<size_t AK>
 void compute_field_derivatives_kernel_planar(const entity_data_cpu& ed,
     const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz)
@@ -87,7 +531,7 @@ template<size_t AK>
 void compute_field_derivatives_kernel_curved(const entity_data_cpu& ed,
     const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz)
 {
-    /* Flop count: ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*num_elements */
+    /* Flop count: ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*num_total_elems */
     /* The elements must be ordered by orientation -> entity::sort_by_orientation() */
 
     assert(ed.g_order > 1);
@@ -168,10 +612,16 @@ void compute_field_derivatives_kernel_curved(const entity_data_cpu& ed,
 }
 
 
+void compute_field_curl(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz);
+
+void compute_field_curl(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha, vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz,
+    const vecxd& jx, const vecxd& jy, const vecxd& jz);
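+
+/* Usage sketch (hypothetical caller; the real ones are the compute_curls_* helpers
+ * in src/maxwell/maxwell_cpu.cpp):
+ *
+ *   vecxd cx = vecxd::Zero(ndofs), cy = vecxd::Zero(ndofs), cz = vecxd::Zero(ndofs);
+ *   for (const auto& ed : eds)                                // one call per entity
+ *       compute_field_curl(ed, Hx, Hy, Hz, 1.0, cx, cy, cz);  // (cx,cy,cz) = curl(H)
+ */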
+
 void compute_field_derivatives(const entity_data_cpu& ed,
     const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz);
 
-
 template<size_t AK>
 void compute_lifting_kernel_planar(const entity_data_cpu& ed,
     const vecxd& flux, vecxd& field)
diff --git a/include/libgmshdg/kernels_gpu.h b/include/libgmshdg/kernels_gpu.h
index 3f5c216dd8ccb10cc76cbab931074e738628b8f2..eef7f5345a2e41069dcf6900ceb88d454321e799 100644
--- a/include/libgmshdg/kernels_gpu.h
+++ b/include/libgmshdg/kernels_gpu.h
@@ -21,6 +21,7 @@ struct kernel_gpu_sizes<1>
     static const size_t num_bf              = num_dofs_3D(1); //4
     static const size_t num_fluxes          = num_dofs_2D(1); //3
 
+    static const size_t curl_threads        = 512;
     static const size_t deriv_threads       = 128;
     static const size_t lifting_threads     = 128;
 
@@ -36,6 +37,7 @@ struct kernel_gpu_sizes<2>
     static const size_t num_bf              = num_dofs_3D(2); //10
     static const size_t num_fluxes          = num_dofs_2D(2); //6
 
+    static const size_t curl_threads        = 128;
     static const size_t deriv_threads       = 128;
     static const size_t lifting_threads     = 128;
 
@@ -51,6 +53,7 @@ struct kernel_gpu_sizes<3>
     static const size_t num_bf              = num_dofs_3D(3); //20
     static const size_t num_fluxes          = num_dofs_2D(3); //10
 
+    static const size_t curl_threads        = 128;
     static const size_t deriv_threads       = 128;
     static const size_t lifting_threads     = 128;
 
@@ -66,6 +69,7 @@ struct kernel_gpu_sizes<4>
     static const size_t num_bf              = num_dofs_3D(4); //35
     static const size_t num_fluxes          = num_dofs_2D(4);
 
+    static const size_t curl_threads        = 128;
     static const size_t deriv_threads       = 512;
     static const size_t lifting_threads     = 128;
 
@@ -81,10 +85,11 @@ struct kernel_gpu_sizes<5>
     static const size_t num_bf              = num_dofs_3D(5);
     static const size_t num_fluxes          = num_dofs_2D(5);
 
-    static const size_t deriv_threads       = 1024;
+    static const size_t curl_threads        = 64;
+    static const size_t deriv_threads       = 512;
     static const size_t lifting_threads     = 128;
 
-        static const size_t cells_per_dblock    = 32;
+    static const size_t cells_per_dblock    = 32;
     static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
     static const size_t dblock_size         = 128;
     static const size_t parallel_dblocks    = 4;
@@ -96,10 +101,11 @@ struct kernel_gpu_sizes<6>
     static const size_t num_bf              = num_dofs_3D(6);
     static const size_t num_fluxes          = num_dofs_2D(6);
 
-    static const size_t deriv_threads       = 1024;
+    static const size_t curl_threads        = 64;
+    static const size_t deriv_threads       = 512;
     static const size_t lifting_threads     = 128;
 
-        static const size_t cells_per_dblock    = 32;
+    static const size_t cells_per_dblock    = 32;
     static const size_t dofs_per_dblock     = num_bf * cells_per_dblock;
     static const size_t dblock_size         = 128;
     static const size_t parallel_dblocks    = 4;
@@ -148,6 +154,18 @@ size_t gpu_dblocks_dofs(const entity_data_cpu&);
 #define FROM_GPU    false
 void reshape_dofs(const entity_data_cpu&, const entity_data_gpu&, const vecxd&, vecxd&, bool);
 
+void
+gpu_compute_field_curl(const entity_data_gpu& edg,
+    const double *fx, const double *fy, const double *fz, const double alpha,
+    double *curl_fx, double* curl_fy, double* curl_fz,
+    gpuStream_t stream = 0);
+
+void
+gpu_compute_field_curl(const entity_data_gpu& edg,
+    const double *fx, const double *fy, const double *fz, const double alpha,
+    double *curl_fx, double* curl_fy, double* curl_fz,
+    double *src_x, double *src_y, double *src_z,
+    gpuStream_t stream = 0);
 
 void
 gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F,
diff --git a/include/maxwell/maxwell_interface.h b/include/maxwell/maxwell_interface.h
index dd7900d54c475a5b50b66b8fdef1612e460f65ed..943a2853919855783050155310a58aa48163a661 100644
--- a/include/maxwell/maxwell_interface.h
+++ b/include/maxwell/maxwell_interface.h
@@ -301,9 +301,11 @@ struct solver_state_gpu
     double                              curr_time;
     size_t                              curr_timestep;
 
+#ifndef NEW_CURLS
     field_gpu                           dx;
     field_gpu                           dy;
     field_gpu                           dz;
+#endif
 
     field_gpu                           k1;
     field_gpu                           k2;
diff --git a/share/validation/params_test_maxwell_resonator.lua b/share/validation/params_test_maxwell_resonator.lua
index 6943e4b69511e79dc8d185641bcdb3da16f62f43..f6c513d9e4a32cd747a03f81bc79aa834b3dbd55 100644
--- a/share/validation/params_test_maxwell_resonator.lua
+++ b/share/validation/params_test_maxwell_resonator.lua
@@ -4,10 +4,10 @@
 --[[ Problem setup ]]--
 sim.name = "test_maxwell_resonator"             -- simulation name
 sim.dt = 1e-12                                  -- timestep size
-sim.timesteps = 5001                           -- num of iterations
+sim.timesteps = 501                           -- num of iterations
 sim.gmsh_model = "resonator.geo"                -- gmsh model filename
-sim.use_gpu = 0                                 -- 0: cpu, 1: gpu
-sim.approx_order = 1                            -- approximation order
+sim.use_gpu = 1                                 -- 0: cpu, 1: gpu
+sim.approx_order = 3                            -- approximation order
 sim.time_integrator = "leapfrog"
 postpro.silo_output_rate = 100
 postpro.cycle_print_rate = 100                  -- console print rate
@@ -87,22 +87,22 @@ end
 debug.analytical_solution = ansol
 --debug.dump_cell_ranks = true
 
-if ( do_IO ) then
-    fh = io.open("energy.txt", "w")
-end
-
-function on_timestep(ts)
-    local e = compute_energy()
-    local err = compute_error()
-    if ( do_IO ) then
-        local tot_energy = e.Ex + e.Ey + e.Ez + e.Hx + e.Hy + e.Hz
-        local energy_err = 100*(tot_energy - 2*We)/(2*We)
-        fh:write(energy_err, " ", tot_energy, " ", e.Ey, " ", e.Hx, " ", e.Hz, "\n")
-    end
-end
-
-function on_exit()
-    if ( do_IO ) then
-        fh:close()
-    end
-end
+--if ( do_IO ) then
+--    fh = io.open("energy.txt", "w")
+--end
+
+--function on_timestep(ts)
+    --local e = compute_energy()
+    --local err = compute_error()
+    --if ( do_IO ) then
+    --    local tot_energy = e.Ex + e.Ey + e.Ez + e.Hx + e.Hy + e.Hz
+    --    local energy_err = 100*(tot_energy - 2*We)/(2*We)
+    --    fh:write(energy_err, " ", tot_energy, " ", e.Ey, " ", e.Hx, " ", e.Hz, "\n")
+    --end
+--end
+
+--function on_exit()
+--    if ( do_IO ) then
+--        fh:close()
+--    end
+--end
diff --git a/src/libgmshdg/kernels_cpu.cpp b/src/libgmshdg/kernels_cpu.cpp
index a1a2ecfebaa3e40a2dfe26afa2d84289d0f4a300..72231d23dcc4a83bbbb6708ef75a5e60d945274b 100644
--- a/src/libgmshdg/kernels_cpu.cpp
+++ b/src/libgmshdg/kernels_cpu.cpp
@@ -10,6 +10,113 @@
 #include "libgmshdg/entity_data.h"
 #include "libgmshdg/kernels_cpu.h"
 
+void compute_field_curl(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha,
+    vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz)
+{
+    switch (ed.a_order)
+    {
+        case 1:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            else
+                compute_curl_kernel_curved<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            break;
+
+        case 2:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            else
+                compute_curl_kernel_curved<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            break;
+
+        case 3:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            else
+                compute_curl_kernel_curved<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            break;
+
+        case 4:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            else
+                compute_curl_kernel_curved<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            break;
+        
+        case 5:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            else
+                compute_curl_kernel_curved<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            break;
+
+        case 6:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            else
+                compute_curl_kernel_curved<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz);
+            break;
+
+        default:
+            std::cout << "compute_curl: invalid order" << std::endl;
+    }
+}
+
+void compute_field_curl(const entity_data_cpu& ed,
+    const vecxd& fx, const vecxd& fy, const vecxd& fz, double alpha,
+    vecxd& curl_fx, vecxd& curl_fy, vecxd& curl_fz,
+    const vecxd& jx, const vecxd& jy, const vecxd& jz)
+{
+    switch (ed.a_order)
+    {
+        case 1:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            else
+                compute_curl_kernel_curved<1>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            break;
+
+        case 2:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            else
+                compute_curl_kernel_curved<2>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            break;
+
+        case 3:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            else
+                compute_curl_kernel_curved<3>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            break;
+
+        case 4:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            else
+                compute_curl_kernel_curved<4>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            break;
+        
+        case 5:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            else
+                compute_curl_kernel_curved<5>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            break;
+
+        case 6:
+            if (ed.g_order == 1)
+                compute_curl_kernel_planar<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            else
+                compute_curl_kernel_curved<6>(ed, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, jx, jy, jz);
+            break;
+
+        default:
+            std::cout << "compute_curl_sources: invalid order" << std::endl;
+    }
+}
+
 /* Field derivatives kernel, case for elements with planar faces */
 void compute_field_derivatives(const entity_data_cpu& ed,
     const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz)
diff --git a/src/libgmshdg/kernels_cuda.cu b/src/libgmshdg/kernels_cuda.cu
index ac9b391ced4ba938772f9f9213583150cfaefc86..78a3c34d5da675ca5b920601e277444ed4cf9780 100644
--- a/src/libgmshdg/kernels_cuda.cu
+++ b/src/libgmshdg/kernels_cuda.cu
@@ -9,6 +9,147 @@
 #include "libgmshdg/entity_data.h"
 #include "libgmshdg/kernels_gpu.h"
 
+template<size_t K>
+__global__ void
+gpu_curl_planar(const double *fx, const double *fy, const double *fz,
+    const double * __restrict__ J, gpuTextureObject_t DM_tex, double alpha,
+    double * __restrict__ curl_fx, double * __restrict__ curl_fy, double * __restrict__ curl_fz,
+    int32_t num_all_elems, const int32_t* orients, int32_t dof_base)
+{
+    using KS = kernel_gpu_sizes<K>;
+
+    const int32_t ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
+    if (ofs_in_entity >= num_all_elems*KS::num_bf)
+        return;
+
+    const int32_t output_dof_offset = dof_base + ofs_in_entity;
+
+    const int32_t iT = ofs_in_entity / KS::num_bf;
+    const int32_t elem_dof_base = dof_base + iT*KS::num_bf;
+    const int32_t iO = orients[iT];
+    const int32_t DM_row = ofs_in_entity % KS::num_bf;
+    const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+    const int32_t DM_base = DM_orient + DM_row;
+    const int32_t jac_ofs = 9*iT;
+
+    double acc_dfx_dy = 0.0;
+    double acc_dfx_dz = 0.0;
+
+    double acc_dfy_dx = 0.0;
+    double acc_dfy_dz = 0.0;
+
+    double acc_dfz_dx = 0.0;
+    double acc_dfz_dy = 0.0;
+
+    for (int32_t dof = 0; dof < KS::num_bf; dof++)
+    {
+        int32_t f_ofs = elem_dof_base + dof;
+        double dm_0 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof);
+        double dm_1 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + KS::num_bf);
+        double dm_2 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + 2*KS::num_bf);
+        double Fx = fx[f_ofs];
+        double Fy = fy[f_ofs];
+        double Fz = fz[f_ofs];
+
+        acc_dfx_dy += J[jac_ofs+3]*dm_0*Fx;
+        acc_dfx_dy += J[jac_ofs+4]*dm_1*Fx;
+        acc_dfx_dy += J[jac_ofs+5]*dm_2*Fx;
+        acc_dfx_dz += J[jac_ofs+6]*dm_0*Fx;
+        acc_dfx_dz += J[jac_ofs+7]*dm_1*Fx;
+        acc_dfx_dz += J[jac_ofs+8]*dm_2*Fx;
+
+        acc_dfy_dx += J[jac_ofs+0]*dm_0*Fy;
+        acc_dfy_dx += J[jac_ofs+1]*dm_1*Fy;
+        acc_dfy_dx += J[jac_ofs+2]*dm_2*Fy;
+        acc_dfy_dz += J[jac_ofs+6]*dm_0*Fy;
+        acc_dfy_dz += J[jac_ofs+7]*dm_1*Fy;
+        acc_dfy_dz += J[jac_ofs+8]*dm_2*Fy;
+
+        acc_dfz_dx += J[jac_ofs+0]*dm_0*Fz;
+        acc_dfz_dx += J[jac_ofs+1]*dm_1*Fz;
+        acc_dfz_dx += J[jac_ofs+2]*dm_2*Fz;
+        acc_dfz_dy += J[jac_ofs+3]*dm_0*Fz;
+        acc_dfz_dy += J[jac_ofs+4]*dm_1*Fz;
+        acc_dfz_dy += J[jac_ofs+5]*dm_2*Fz;
+    }
+
+    curl_fx[output_dof_offset] = alpha * (acc_dfz_dy - acc_dfy_dz);
+    curl_fy[output_dof_offset] = alpha * (acc_dfx_dz - acc_dfz_dx);
+    curl_fz[output_dof_offset] = alpha * (acc_dfy_dx - acc_dfx_dy);
+}
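+
+/* Thread mapping of the kernel above: one thread per (element, basis function)
+ * pair, i.e. one thread per output dof of the entity; the physical derivatives
+ * are accumulated in registers and the three curl components are written once at
+ * the end. Block and grid sizes are chosen in launch_curl_kernel() below using
+ * KS::curl_threads. */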
+
+template<size_t K>
+__global__ void
+gpu_curl_planar(const double *fx, const double *fy, const double *fz,
+    const double * __restrict__ J, gpuTextureObject_t DM_tex, double alpha,
+    double * __restrict__ curl_fx, double * __restrict__ curl_fy, double * __restrict__ curl_fz,
+    const double * __restrict__ src_x, const double * __restrict__ src_y, const double * __restrict__ src_z,
+    int32_t num_all_elems, const int32_t* orients, int32_t dof_base)
+{
+    using KS = kernel_gpu_sizes<K>;
+
+    const int32_t ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
+    if (ofs_in_entity >= num_all_elems*KS::num_bf)
+        return;
+
+    const int32_t output_dof_offset = dof_base + ofs_in_entity;
+
+    const int32_t iT = ofs_in_entity / KS::num_bf;
+    const int32_t elem_dof_base = dof_base + iT*KS::num_bf;
+    const int32_t iO = orients[iT];
+    const int32_t DM_row = ofs_in_entity % KS::num_bf;
+    const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+    const int32_t DM_base = DM_orient + DM_row;
+    const int32_t jac_ofs = 9*iT;
+
+    double acc_dfx_dy = 0.0;
+    double acc_dfx_dz = 0.0;
+
+    double acc_dfy_dx = 0.0;
+    double acc_dfy_dz = 0.0;
+
+    double acc_dfz_dx = 0.0;
+    double acc_dfz_dy = 0.0;
+
+    for (int32_t dof = 0; dof < KS::num_bf; dof++)
+    {
+        int32_t f_ofs = elem_dof_base + dof;
+        double dm_0 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof);
+        double dm_1 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + KS::num_bf);
+        double dm_2 = fetch_tex(DM_tex, DM_base + 3*KS::num_bf*dof + 2*KS::num_bf);
+        double Fx = fx[f_ofs];
+        double Fy = fy[f_ofs];
+        double Fz = fz[f_ofs];
+        
+        acc_dfx_dy += J[jac_ofs+3]*dm_0*Fx;
+        acc_dfx_dy += J[jac_ofs+4]*dm_1*Fx;
+        acc_dfx_dy += J[jac_ofs+5]*dm_2*Fx;
+        acc_dfx_dz += J[jac_ofs+6]*dm_0*Fx;
+        acc_dfx_dz += J[jac_ofs+7]*dm_1*Fx;
+        acc_dfx_dz += J[jac_ofs+8]*dm_2*Fx;
+
+        acc_dfy_dx += J[jac_ofs+0]*dm_0*Fy;
+        acc_dfy_dx += J[jac_ofs+1]*dm_1*Fy;
+        acc_dfy_dx += J[jac_ofs+2]*dm_2*Fy;
+        acc_dfy_dz += J[jac_ofs+6]*dm_0*Fy;
+        acc_dfy_dz += J[jac_ofs+7]*dm_1*Fy;
+        acc_dfy_dz += J[jac_ofs+8]*dm_2*Fy;
+
+        acc_dfz_dx += J[jac_ofs+0]*dm_0*Fz;
+        acc_dfz_dx += J[jac_ofs+1]*dm_1*Fz;
+        acc_dfz_dx += J[jac_ofs+2]*dm_2*Fz;
+        acc_dfz_dy += J[jac_ofs+3]*dm_0*Fz;
+        acc_dfz_dy += J[jac_ofs+4]*dm_1*Fz;
+        acc_dfz_dy += J[jac_ofs+5]*dm_2*Fz;
+    }
+
+    curl_fx[output_dof_offset] = alpha * (acc_dfz_dy - acc_dfy_dz) - src_x[output_dof_offset];
+    curl_fy[output_dof_offset] = alpha * (acc_dfx_dz - acc_dfz_dx) - src_y[output_dof_offset];
+    curl_fz[output_dof_offset] = alpha * (acc_dfy_dx - acc_dfx_dy) - src_z[output_dof_offset];
+}
+
 /* compute α∇F */
 template<size_t K>
 __global__ void
@@ -65,63 +206,57 @@ gpu_deriv_planar(const double *F, const double * __restrict__ J,
 }
 
 template<size_t K>
-__global__ void
-gpu_deriv_planar_blocked(gpuTextureObject_t F, const double * __restrict__ J,
-    gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
-    double * __restrict__ dF_dy, double * __restrict__ dF_dz,
-    int32_t num_all_elems, int32_t* orients, int32_t dof_base)
+void
+launch_curl_kernel(const entity_data_gpu& edg,
+    const double *fx, const double *fy, const double *fz, const double alpha,
+    double *curl_fx, double* curl_fy, double* curl_fz,
+    cudaStream_t stream)
 {
+    auto J = edg.jacobians.data();
+    auto Dtex = edg.differentiation_matrices.get_texture();
+    int32_t num_elems = edg.num_all_elems;
+    auto orients = edg.element_orientation.data();
+    auto num_orients = edg.element_orientation.size();
+
     using KS = kernel_gpu_sizes<K>;
 
-    if (threadIdx.x >= KS::dofs_per_dblock)
-        return;
-    const int32_t y_idx = (blockIdx.y * blockDim.y + threadIdx.y);
-    const int32_t elem_base = y_idx * KS::cells_per_dblock;
-    const int32_t elem_ofs = threadIdx.x / KS::num_bf;
-    const int32_t iT = elem_base + elem_ofs;
+    auto num_blocks = edg.num_bf*edg.num_all_elems/KS::curl_threads;
+    if (edg.num_bf*edg.num_all_elems % KS::curl_threads)
+        num_blocks += 1;
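+    /* Ceil-division sketch with illustrative numbers (not taken from a real mesh):
+     * K = 3 gives num_bf = 20 and curl_threads = 128, so 1000 elements need
+     * 20000 threads -> 20000/128 = 156 full blocks + a remainder of 32 -> 157 blocks. */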
 
-    if (iT >= num_all_elems)
-        return;
-    
-    const int32_t block_dof_base = y_idx * KS::dblock_size;
+    if (edg.g_order == 1)
+        gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz,
+            J, Dtex, alpha, curl_fx, curl_fy, curl_fz, num_elems, orients, edg.dof_base);
+    /* TODO: the curved-element (g_order > 1) curl kernel is not implemented on the
+     * GPU yet, so nothing is launched in that case. */
+}
 
-    const int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x;
+template<size_t K>
+void
+launch_curl_kernel(const entity_data_gpu& edg,
+    const double *fx, const double *fy, const double *fz, const double alpha,
+    double *curl_fx, double* curl_fy, double* curl_fz,
+    double *src_x, double *src_y, double *src_z,
+    cudaStream_t stream)
+{
+    auto J = edg.jacobians.data();
+    auto Dtex = edg.differentiation_matrices.get_texture();
+    int32_t num_elems = edg.num_all_elems;
+    auto orients = edg.element_orientation.data();
+    auto num_orients = edg.element_orientation.size();
 
-    const int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf;
-    const int32_t iO = orients[iT];
-    const int32_t DM_row = threadIdx.x % KS::num_bf;
-    const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
-    const int32_t DM_base = DM_orient + DM_row;
-    const int32_t jac_ofs = 9*iT;
+    using KS = kernel_gpu_sizes<K>;
 
-    double accm_dF_dx = 0.0;
-    double accm_dF_dy = 0.0;
-    double accm_dF_dz = 0.0;
-    for (int32_t dof = 0; dof < KS::num_bf; dof++)
-    {
-        int32_t d_ofs = DM_base + 3*KS::num_bf*dof;
-        int32_t f_ofs = elem_dof_base + dof;
-        double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
-        accm_dF_dx += J[jac_ofs+0] * v;
-        accm_dF_dy += J[jac_ofs+3] * v;
-        accm_dF_dz += J[jac_ofs+6] * v;
-    
-        d_ofs = DM_base + 3*KS::num_bf*dof + KS::num_bf;
-        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
-        accm_dF_dx += J[jac_ofs+1] * v;
-        accm_dF_dy += J[jac_ofs+4] * v;
-        accm_dF_dz += J[jac_ofs+7] * v;
-    
-        d_ofs = DM_base + 3*KS::num_bf*dof + 2*KS::num_bf;
-        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
-        accm_dF_dx += J[jac_ofs+2] * v;
-        accm_dF_dy += J[jac_ofs+5] * v;
-        accm_dF_dz += J[jac_ofs+8] * v;
-    }
+    auto num_blocks = edg.num_bf*edg.num_all_elems/KS::curl_threads;
+    if (edg.num_bf*edg.num_all_elems % KS::curl_threads)
+        num_blocks += 1;
 
-    dF_dx[output_dof_offset] = accm_dF_dx;
-    dF_dy[output_dof_offset] = accm_dF_dy;
-    dF_dz[output_dof_offset] = accm_dF_dz;
+    if (edg.g_order == 1)
+        gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz,
+            J, Dtex, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, 
+            num_elems, orients, edg.dof_base);
+    /* TODO: the curved-element (g_order > 1) curl kernel is not implemented on the
+     * GPU yet, so nothing is launched in that case. */
 }
 
 template<size_t K>
@@ -149,6 +284,83 @@ launch_deriv_kernel(const entity_data_gpu& edg,
     //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
 }
 
+void
+gpu_compute_field_curl(const entity_data_gpu& edg,
+    const double *fx, const double *fy, const double *fz, const double alpha,
+    double *curl_fx, double* curl_fy, double* curl_fz,
+    cudaStream_t stream)
+{
+    
+    switch (edg.a_order)
+    {
+        case 1:
+            launch_curl_kernel<1>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream);
+            break;
+
+        case 2:
+            launch_curl_kernel<2>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream);
+            break;
+    
+        case 3:
+            launch_curl_kernel<3>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream);
+            break;
+
+        case 4:
+            launch_curl_kernel<4>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream);
+            break;
+      
+        case 5:
+            launch_curl_kernel<5>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream);
+            break;
+
+        case 6:
+            launch_curl_kernel<6>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, stream);
+            break;
+
+        default:
+            throw std::invalid_argument("compute_field_curl: invalid order");
+    }
+}
+
+void
+gpu_compute_field_curl(const entity_data_gpu& edg,
+    const double *fx, const double *fy, const double *fz, const double alpha,
+    double *curl_fx, double* curl_fy, double* curl_fz,
+    double *src_x, double *src_y, double *src_z,
+    cudaStream_t stream)
+{
+    
+    switch (edg.a_order)
+    {
+        case 1:
+            launch_curl_kernel<1>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream);
+            break;
+
+        case 2:
+            launch_curl_kernel<2>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream);
+            break;
+    
+        case 3:
+            launch_curl_kernel<3>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream);
+            break;
+
+        case 4:
+            launch_curl_kernel<4>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream);
+            break;
+      
+        case 5:
+            launch_curl_kernel<5>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream);
+            break;
+
+        case 6:
+            launch_curl_kernel<6>(edg, fx, fy, fz, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z, stream);
+            break;
+
+        default:
+            throw std::invalid_argument("compute_field_curl: invalid order");
+    }
+}
+
 void
 gpu_compute_field_derivatives(const entity_data_gpu& edg,
     const double *f, double *df_dx, double* df_dy, double* df_dz, double alpha,
diff --git a/src/maxwell/maxwell_cpu.cpp b/src/maxwell/maxwell_cpu.cpp
index d72bb78262cca87973b4cde8bb333d260f541395..bd04eb3a38b1d4efce227c42198ec8387028bf54 100644
--- a/src/maxwell/maxwell_cpu.cpp
+++ b/src/maxwell/maxwell_cpu.cpp
@@ -112,6 +112,35 @@ init_from_model(const model& mod, solver_state& state, const maxwell::parameter_
 /**************************************************************************
  * Computational kernels: curls
  */
+#ifdef NEW_CURLS
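+/* Curl updates used by the NEW_CURLS path: the kernels write straight into the
+ * destination fields instead of going through the state.dx/dy/dz scratch fields,
+ * i.e. (any material coefficients are assumed to be applied elsewhere in the update)
+ *
+ *   next.H = -curl(curr.E)
+ *   next.E =  curl(curr.H) - J_src
+ */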
+static void
+compute_curls(solver_state& state, const field& curr, field& next)
+{
+    for (const auto& ed : state.eds)
+    {
+        compute_field_curl(ed, curr.Ex, curr.Ey, curr.Ez, -1.0, next.Hx, next.Hy, next.Hz);
+        compute_field_curl(ed, curr.Hx, curr.Hy, curr.Hz, 1.0, next.Ex, next.Ey, next.Ez, state.Jx_src, state.Jy_src, state.Jz_src);
+    }
+}
+
+static void
+compute_curls_E(solver_state& state, const field& curr, field& next)
+{
+    for (const auto& ed : state.eds)
+    {
+        compute_field_curl(ed, curr.Ex, curr.Ey, curr.Ez, -1.0, next.Hx, next.Hy, next.Hz);
+    }
+}
+
+static void
+compute_curls_H(solver_state& state, const field& curr, field& next)
+{
+    for (const auto& ed : state.eds)
+    {
+        compute_field_curl(ed, curr.Hx, curr.Hy, curr.Hz, 1.0, next.Ex, next.Ey, next.Ez, state.Jx_src, state.Jy_src, state.Jz_src);
+    }
+}
+#else
 static void
 compute_curls(solver_state& state, const field& curr, field& next)
 {
@@ -162,6 +191,8 @@ compute_curls_H(solver_state& state, const field& curr, field& next)
     next.Ey = state.dz.Hx - state.dx.Hz - state.Jy_src;
     next.Ez = state.dx.Hy - state.dy.Hx - state.Jz_src;
 }
+#endif
+
 
 
 /**************************************************************************
diff --git a/src/maxwell/maxwell_gpu.cpp b/src/maxwell/maxwell_gpu.cpp
index e4941e7af16e994b7fc1a68f99b2ce5130db6bb6..be7fe862c25faa885a7919b61b2c9d30fc4b66cc 100644
--- a/src/maxwell/maxwell_gpu.cpp
+++ b/src/maxwell/maxwell_gpu.cpp
@@ -128,10 +128,12 @@ void init_from_model(const model& mod, solver_state_gpu& state, const maxwell::p
 {
     state.emf_curr.resize( mod.num_dofs() );
     state.emf_next.resize( mod.num_dofs() );
+
+#ifndef NEW_CURLS
     state.dx.resize( mod.num_dofs() );
     state.dy.resize( mod.num_dofs() );
     state.dz.resize( mod.num_dofs() );
-
+#endif
     state.jumps.resize( mod.num_fluxes() );
     state.fluxes.resize( mod.num_fluxes() );
 
@@ -188,6 +190,74 @@ compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field&
     }
 }
 
+#ifdef NEW_CURLS
+static void
+compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+    auto Jx = state.Jx_src_gpu.data();
+    auto Jy = state.Jy_src_gpu.data();
+    auto Jz = state.Jz_src_gpu.data();
+
+    if (use_sources)
+    {
+        for (const auto& edg : state.edgs)
+        {
+            gpu_compute_field_curl(edg, currp.Ex, currp.Ey, currp.Ez, -1.0, nextp.Hx, nextp.Hy, nextp.Hz, state.compute_stream);
+            gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, Jx, Jy, Jz, state.compute_stream);
+        }
+    }
+    else
+    {
+        for (const auto& edg : state.edgs)
+        {
+            gpu_compute_field_curl(edg, currp.Ex, currp.Ey, currp.Ez, -1.0, nextp.Hx, nextp.Hy, nextp.Hz, state.compute_stream);
+            gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, state.compute_stream);
+        }
+    }
+}
+
+static void
+compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+
+    for (const auto& edg : state.edgs)
+    {
+        gpu_compute_field_curl(edg, currp.Ex, currp.Ey, currp.Ez, -1.0, nextp.Hx, nextp.Hy, nextp.Hz, state.compute_stream);
+    }
+}
+
+static void
+compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+    auto Jx = state.Jx_src_gpu.data();
+    auto Jy = state.Jy_src_gpu.data();
+    auto Jz = state.Jz_src_gpu.data();
+
+    if (use_sources)
+    {
+        for (const auto& edg : state.edgs)
+        {
+            gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, Jx, Jy, Jz, state.compute_stream);
+        }
+    }
+    else
+    {
+        for (const auto& edg : state.edgs)
+        {
+            gpu_compute_field_curl(edg, currp.Hx, currp.Hy, currp.Hz, 1.0, nextp.Ex, nextp.Ey, nextp.Ez, state.compute_stream);
+        }
+    }
+}
+#else
 static void
 compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
@@ -282,8 +352,8 @@ compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next,
         gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
         gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
     }
-
 }
+#endif
 
 static void
 compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
diff --git a/src/maxwell/maxwell_solver.cpp b/src/maxwell/maxwell_solver.cpp
index 9f39c94a5973b58f93b5d83c00da9dafe3a75e9c..d5e3efb992a9ccc19928655db1c214209da23a1a 100644
--- a/src/maxwell/maxwell_solver.cpp
+++ b/src/maxwell/maxwell_solver.cpp
@@ -247,6 +247,7 @@ void solver_mainloop(const model& mod, State& state, maxwell::parameter_loader&
         do_sources(mod, state, mpl);
         swap(state, mpl);
         
+        cudaDeviceSynchronize(); /* wait for queued GPU work to finish before reading the timer */
         double time = tc.toc();
         total_iteration_time += time;
 
diff --git a/tests/profile_curl_gpu.cpp b/tests/profile_curl_gpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..90999b70941662942b9740d798b1a51370722472
--- /dev/null
+++ b/tests/profile_curl_gpu.cpp
@@ -0,0 +1,168 @@
+/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin
+ * solver for Conservation Laws.
+ *
+ * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège
+ * 
+ * This code is released under GNU AGPLv3 license, see LICENSE.txt for details.
+ */
+
+#include <iostream>
+#include <iomanip>
+
+#include <unistd.h>
+
+#include "test.h"
+#include "sgr.hpp"
+#include "libgmshdg/gmsh_io.h"
+#include "libgmshdg/entity_data.h"
+#include "libgmshdg/kernels_cpu.h"
+#include "timecounter.h"
+
+#include "libgmshdg/kernels_gpu.h"
+
+using namespace sgr;
+
+static void
+make_geometry(int order, double mesh_h)
+{
+    gm::add("difftest");
+    gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 1.0);
+    gmo::synchronize();
+
+    gvp_t vp;
+    gm::getEntities(vp);
+    gmm::setSize(vp, mesh_h);
+}
+
+int profile_curl(int geometric_order, int approximation_order)
+{
+    std::cout << cyanfg << "Testing geometric order " << geometric_order;
+    std::cout << ", approximation order = " << approximation_order << nofg;
+    std::cout << std::endl;
+
+    auto fx = [](const point_3d& pt) -> double {
+        return std::sin(M_PI*pt.y())*std::sin(M_PI*pt.z());
+    };
+    auto fy = [](const point_3d& pt) -> double {
+        return std::sin(M_PI*pt.z())*std::sin(M_PI*pt.x());
+    };
+    auto fz = [](const point_3d& pt) -> double {
+        return std::sin(M_PI*pt.x())*std::sin(M_PI*pt.y());
+    };
+    
+    auto cfx = [](const point_3d& pt) -> double {
+        return M_PI*std::sin(M_PI*pt.x())*std::cos(M_PI*pt.y()) - M_PI*std::cos(M_PI*pt.z())*std::sin(M_PI*pt.x());
+    };
+    auto cfy = [](const point_3d& pt) -> double {
+        return M_PI*std::sin(M_PI*pt.y())*std::cos(M_PI*pt.z()) - M_PI*std::cos(M_PI*pt.x())*std::sin(M_PI*pt.y());
+    };
+    auto cfz = [](const point_3d& pt) -> double {
+        return M_PI*std::sin(M_PI*pt.z())*std::cos(M_PI*pt.x()) - M_PI*std::cos(M_PI*pt.y())*std::sin(M_PI*pt.z());
+    };
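+
+    /* Worked reference for the check below: with
+     *   f = (sin(pi y) sin(pi z), sin(pi z) sin(pi x), sin(pi x) sin(pi y)),
+     * the exact curl is
+     *   (curl f)_x = pi sin(pi x) (cos(pi y) - cos(pi z))
+     *   (curl f)_y = pi sin(pi y) (cos(pi z) - cos(pi x))
+     *   (curl f)_z = pi sin(pi z) (cos(pi x) - cos(pi y))
+     * which is what cfx/cfy/cfz encode term by term. */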
+
+        double h = 0.04;
+        make_geometry(0,h);
+        gmm::generate( DIMENSION(3) );
+
+        model mod(geometric_order, approximation_order);
+        mod.build();
+
+        auto& e0 = mod.entity_at(0);
+        e0.sort_by_orientation();
+
+        vecxd proj_fx = e0.project(fx);
+        vecxd proj_fy = e0.project(fy);
+        vecxd proj_fz = e0.project(fz);
+
+        /* Make CPU entity data */
+        entity_data_cpu ed;
+        e0.populate_entity_data(ed, mod);
+        /* Derive GPU entity data */
+        entity_data_gpu edg(ed);
+
+        auto model_num_dofs = mod.num_dofs();
+
+        /* Prepare I/O vectors and call kernel */
+        device_vector<double> proj_fx_gpu(proj_fx.data(), proj_fx.size());
+        device_vector<double> proj_fy_gpu(proj_fy.data(), proj_fy.size());
+        device_vector<double> proj_fz_gpu(proj_fz.data(), proj_fz.size());
+
+        device_vector<double> comp_cfx_gpu(model_num_dofs);
+        device_vector<double> comp_cfy_gpu(model_num_dofs);
+        device_vector<double> comp_cfz_gpu(model_num_dofs);
+
+        timecounter_gpu tc;
+        tc.tic();
+        gpu_compute_field_curl(edg, proj_fx_gpu.data(), proj_fy_gpu.data(), proj_fz_gpu.data(), 1.0,
+            comp_cfx_gpu.data(), comp_cfy_gpu.data(), comp_cfz_gpu.data());
+        double time = tc.toc();
+
+        vecxd comp_cfx = vecxd::Zero(model_num_dofs);
+        vecxd comp_cfy = vecxd::Zero(model_num_dofs);
+        vecxd comp_cfz = vecxd::Zero(model_num_dofs);
+
+        comp_cfx_gpu.copyout( comp_cfx.data() );
+        comp_cfy_gpu.copyout( comp_cfy.data() );
+        comp_cfz_gpu.copyout( comp_cfz.data() );
+
+        if (geometric_order == 1)
+        {
+            std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+            double flops = (45*ed.num_bf + 6)*ed.num_bf*e0.num_cells();
+            std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+        }
+        else
+        {
+            std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+            double flops = ((45*ed.num_bf+12)*ed.num_bf*ed.num_qp + 6*((2*ed.num_bf-1) + 1)*ed.num_bf)*e0.num_cells();
+            std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+        }
+
+        vecxd proj_cfx = e0.project(cfx);
+        vecxd proj_cfy = e0.project(cfy);
+        vecxd proj_cfz = e0.project(cfz);
+
+        double err_x = 0.0;
+        double err_y = 0.0;
+        double err_z = 0.0;
+
+        for (size_t iT = 0; iT < e0.num_cells(); iT++)
+        {
+            auto& pe = e0.cell(iT);
+            auto& re = e0.cell_refelem(pe);
+            matxd mass = e0.mass_matrix(iT);
+            auto num_bf = re.num_basis_functions();
+            vecxd diff_x = comp_cfx.segment(iT*num_bf, num_bf) - proj_cfx.segment(iT*num_bf, num_bf); 
+            vecxd diff_y = comp_cfy.segment(iT*num_bf, num_bf) - proj_cfy.segment(iT*num_bf, num_bf); 
+            vecxd diff_z = comp_cfz.segment(iT*num_bf, num_bf) - proj_cfz.segment(iT*num_bf, num_bf); 
+        
+            err_x += diff_x.dot(mass*diff_x);
+            err_y += diff_y.dot(mass*diff_y);
+            err_z += diff_z.dot(mass*diff_z);
+        }
+        std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
+        std::cout << " " << std::sqrt(err_z) << std::endl;
+
+    return 0;
+}
+
+int main(int argc, const char *argv[])
+{
+    gmsh::initialize();
+    gmsh::option::setNumber("General.Terminal", 0);
+    gmsh::option::setNumber("Mesh.Algorithm", 1);
+    gmsh::option::setNumber("Mesh.Algorithm3D", 1);
+
+    if (argc != 3)
+    {
+        std::cout << argv[0] << " <gorder> <aorder>" << std::endl;
+        return 1;
+    }
+
+    auto go = atoi(argv[1]);
+    auto ao = atoi(argv[2]);
+
+    profile_curl(go, ao);
+
+    return 0;
+}
diff --git a/tests/profile_differentiation_gpu.cpp b/tests/profile_differentiation_gpu.cpp
index 88d7767d690eeab71b9c0e33c8b8d3980a9120c5..ed8b71ef41c0d10214c222ebee82cb134546c5e7 100644
--- a/tests/profile_differentiation_gpu.cpp
+++ b/tests/profile_differentiation_gpu.cpp
@@ -55,6 +55,8 @@ int profile_differentiation(int geometric_order, int approximation_order)
 
         double h = 0.04;
         make_geometry(0,h);
+        gmm::generate( DIMENSION(3) );
+
         model mod(geometric_order, approximation_order);
         mod.build();
 
@@ -104,7 +106,7 @@ int profile_differentiation(int geometric_order, int approximation_order)
         vecxd Pdf_dx_e0 = e0.project(df_dx);
         vecxd Pdf_dy_e0 = e0.project(df_dy);
         vecxd Pdf_dz_e0 = e0.project(df_dz);
-
+/*
         double err_x = 0.0;
         double err_y = 0.0;
         double err_z = 0.0;
@@ -125,7 +127,7 @@ int profile_differentiation(int geometric_order, int approximation_order)
         }
         std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
         std::cout << " " << std::sqrt(err_z) << std::endl;
-
+*/
     return 0;
 }
 
diff --git a/tests/test_curl.cpp b/tests/test_curl.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ee57fcf1a6a24e930c9865ce4159b7fc45c6fe39
--- /dev/null
+++ b/tests/test_curl.cpp
@@ -0,0 +1,251 @@
+/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin
+ * solver for Conservation Laws.
+ *
+ * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège
+ * 
+ * This code is released under GNU AGPLv3 license, see LICENSE.txt for details.
+ */
+
+#include <iostream>
+#include <iomanip>
+
+#include <unistd.h>
+
+#include "test.h"
+#include "sgr.hpp"
+#include "timecounter.h"
+#include "libgmshdg/gmsh_io.h"
+#include "libgmshdg/entity_data.h"
+#include "libgmshdg/kernels_cpu.h"
+#include "libgmshdg/silo_output.hpp"
+
+using namespace sgr;
+
+static void
+make_geometry(int order, double mesh_h)
+{
+    gm::add("difftest");
+
+    std::vector<std::pair<int,int>> objects;
+
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 0.5) )
+        );
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.5, 0.5, 0.5, 0.5) )
+        );
+
+    std::vector<std::pair<int, int>> tools;
+    gmsh::vectorpair odt;
+    std::vector<gmsh::vectorpair> odtm;
+    gmo::fragment(objects, tools, odt, odtm);
+
+    gmo::synchronize();
+
+    gvp_t vp;
+    gm::getEntities(vp);
+    gmm::setSize(vp, mesh_h);
+}
+
+int test_curl_convergence(int geometric_order, int approximation_order)
+{
+    std::vector<double> sizes({ 0.32, 0.16, 0.08, 0.04 });
+    std::vector<double> errors;
+
+    std::cout << std::endl << cyanfg << "Testing geometric order " << geometric_order;
+    std::cout << ", approximation order = " << approximation_order << nofg;
+    std::cout << std::endl;
+
+    auto fx = [](const point_3d& pt) -> double {
+        return std::sin(M_PI*pt.y())*std::sin(M_PI*pt.z());
+    };
+    auto fy = [](const point_3d& pt) -> double {
+      return std::sin(M_PI*pt.z())*std::sin(M_PI*pt.x());
+    };
+    auto fz = [](const point_3d& pt) -> double {
+      return std::sin(M_PI*pt.x())*std::sin(M_PI*pt.y());
+    };
+    
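+    /* cfx, cfy and cfz are the components of the analytical curl of F = (fx, fy, fz):
+     * curl F = (dFz/dy - dFy/dz, dFx/dz - dFz/dx, dFy/dx - dFx/dy). */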
+    auto cfx = [](const point_3d& pt) -> double {
+        return M_PI*std::sin(M_PI*pt.x())*std::cos(M_PI*pt.y()) - M_PI*std::cos(M_PI*pt.z())*std::sin(M_PI*pt.x());
+    };
+    auto cfy = [](const point_3d& pt) -> double {
+      return M_PI*std::sin(M_PI*pt.y())*std::cos(M_PI*pt.z()) - M_PI*std::cos(M_PI*pt.x())*std::sin(M_PI*pt.y());
+    };
+    auto cfz = [](const point_3d& pt) -> double {
+      return M_PI*std::sin(M_PI*pt.z())*std::cos(M_PI*pt.x()) - M_PI*std::cos(M_PI*pt.y())*std::sin(M_PI*pt.z());
+    };
+
+    std::vector<double> errs_x;
+    std::vector<double> errs_y;
+    std::vector<double> errs_z;
+
+    for (size_t i = 0; i < sizes.size(); i++)
+    {
+        double h = sizes[i];
+        make_geometry(0,h);
+        gmm::generate( DIMENSION(3) );
+
+        model mod(geometric_order, approximation_order);
+        mod.build();
+
+#ifdef WRITE_TEST_OUTPUTS
+        std::stringstream ss;
+        ss << "curl_go_" << geometric_order << "_ao_" << approximation_order;
+        ss << "_seq_" << i << ".silo";
+
+        silo silodb;
+        silodb.create_db(ss.str());
+        silodb.import_mesh_from_gmsh();
+        silodb.write_mesh();
+#endif
+        auto model_num_dofs = mod.num_dofs();
+
+        vecxd proj_fx = vecxd::Zero(model_num_dofs);
+        vecxd proj_fy = vecxd::Zero(model_num_dofs);
+        vecxd proj_fz = vecxd::Zero(model_num_dofs);
+        
+        vecxd proj_cfx = vecxd::Zero(model_num_dofs);
+        vecxd proj_cfy = vecxd::Zero(model_num_dofs);
+        vecxd proj_cfz = vecxd::Zero(model_num_dofs);
+
+        vecxd comp_cfx = vecxd::Zero(model_num_dofs);
+        vecxd comp_cfy = vecxd::Zero(model_num_dofs);
+        vecxd comp_cfz = vecxd::Zero(model_num_dofs);
+
+        std::vector<entity_data_cpu> eds;
+
+        for (auto& e : mod)
+        {
+            e.project(fx, proj_fx);
+            e.project(fy, proj_fy);
+            e.project(fz, proj_fz);
+            
+            e.project(cfx, proj_cfx);
+            e.project(cfy, proj_cfy);
+            e.project(cfz, proj_cfz);
+
+            entity_data_cpu ed;
+            e.populate_entity_data(ed, mod);
+            eds.push_back( std::move(ed) );
+        }
+
+        for (auto& ed : eds)
+        {
+            timecounter tc;
+            tc.tic();
+            compute_field_curl(ed, proj_fx, proj_fy, proj_fz, 1.0, comp_cfx, comp_cfy, comp_cfz);
+            double time = tc.toc();
+
+            auto num_cells = num_elems_all_orientations(ed);
+            auto num_bf = num_dofs_3D(approximation_order);
+            auto field_size = num_cells*num_bf;
+
+            if (geometric_order == 1)
+            {
+                std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = (45*ed.num_bf+6)*ed.num_bf*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+
+                double gbytes = (6*field_size      // 3 input + 3 output field vectors
+                    + num_cells*3*3)*8/1e9;        // plus per-element 3x3 Jacobian data
+                std::cout << "Kernel bandwidth: " << gbytes/time << " GB/s" << std::endl;
+            }
+            else
+            {
+                std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = ((45*ed.num_bf+12)*ed.num_bf*ed.num_qp + 6*((2*ed.num_bf-1) + 1)*ed.num_bf)*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            }
+        }
+        
+        double err_x = 0.0;
+        double err_y = 0.0;
+        double err_z = 0.0;
+
+#ifdef WRITE_TEST_OUTPUTS
+        auto model_num_cells = mod.num_cells();
+        std::vector<double> var_cfx(model_num_cells);
+        std::vector<double> var_cfy(model_num_cells);
+        std::vector<double> var_cfz(model_num_cells);
+#endif
+
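+        /* Accumulate the mass-matrix-weighted L2 error between the computed curl
+         * and the projection of the analytical curl: err += diff^T * M * diff. */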
+        for (auto& e : mod)
+        {
+            for (size_t iT = 0; iT < e.num_cells(); iT++)
+            {
+                auto& pe = e.cell(iT);
+                auto& re = e.cell_refelem(pe);
+                matxd mass = e.mass_matrix(iT);
+                auto num_bf = re.num_basis_functions();
+                auto ofs = e.cell_model_dof_offset(iT);
+                vecxd diff_x = proj_cfx.segment(ofs, num_bf) - comp_cfx.segment(ofs, num_bf); 
+                vecxd diff_y = proj_cfy.segment(ofs, num_bf) - comp_cfy.segment(ofs, num_bf); 
+                vecxd diff_z = proj_cfz.segment(ofs, num_bf) - comp_cfz.segment(ofs, num_bf); 
+            
+                err_x += diff_x.dot(mass*diff_x);
+                err_y += diff_y.dot(mass*diff_y);
+                err_z += diff_z.dot(mass*diff_z);
+
+#ifdef WRITE_TEST_OUTPUTS
+                auto gi = e.cell_global_index_by_gmsh(iT);
+                assert(gi < model_num_cells);
+                vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+                var_cfx[gi] = comp_cfx.segment(ofs, num_bf).dot(phi_bar);
+                var_cfy[gi] = comp_cfy.segment(ofs, num_bf).dot(phi_bar);
+                var_cfz[gi] = comp_cfz.segment(ofs, num_bf).dot(phi_bar);
+#endif
+            }
+            std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
+            std::cout << " " << std::sqrt(err_z) << std::endl;
+        }
+
+#ifdef WRITE_TEST_OUTPUTS
+        silodb.write_zonal_variable("cfx", var_cfx);
+        silodb.write_zonal_variable("cfy", var_cfy);
+        silodb.write_zonal_variable("cfz", var_cfz);
+#endif
+    
+        errs_x.push_back( std::sqrt(err_x) );
+        errs_y.push_back( std::sqrt(err_y) );
+        errs_z.push_back( std::sqrt(err_z) );
+        std::cout << std::endl;
+    }
+
+    double rate_x = 0.0;
+    double rate_y = 0.0;
+    double rate_z = 0.0;
+
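+    /* The mesh size is halved at each refinement step, so log2 of the error ratio
+     * between consecutive steps estimates the convergence rate, which should be
+     * close to the approximation order. */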
+    std::cout << Byellowfg << "rate curl_x  rate curl_y  rate curl_z" << reset << std::endl;
+    for (size_t i = 1; i < sizes.size(); i++)
+    {
+        std::cout << (rate_x = std::log2(errs_x[i-1]/errs_x[i]) ) << " ";
+        std::cout << (rate_y = std::log2(errs_y[i-1]/errs_y[i]) ) << " ";
+        std::cout << (rate_z = std::log2(errs_z[i-1]/errs_z[i]) ) << std::endl;
+    }
+
+    COMPARE_VALUES_ABSOLUTE("df/dx", rate_x, double(approximation_order), 0.2);
+    COMPARE_VALUES_ABSOLUTE("df/dy", rate_y, double(approximation_order), 0.2);
+    COMPARE_VALUES_ABSOLUTE("df/dz", rate_z, double(approximation_order), 0.2);
+
+    return 0;
+}
+
+int main(void)
+{
+    gmsh::initialize();
+    gmsh::option::setNumber("General.Terminal", 0);
+    gmsh::option::setNumber("Mesh.Algorithm", 1);
+    gmsh::option::setNumber("Mesh.Algorithm3D", 1);
+
+
+    int failed_tests = 0;
+
+    std::cout << Bmagentafg << " *** TESTING: CURL ***" << reset << std::endl;
+    for (size_t go = 1; go < 2; go++)
+        for (size_t ao = 3; ao < 5; ao++)
+            failed_tests += test_curl_convergence(go, ao);
+
+    return failed_tests;
+}
diff --git a/tests/test_curl_gpu.cpp b/tests/test_curl_gpu.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d10b51d30b5884cd56d2e6d950eaa55af0d0baba
--- /dev/null
+++ b/tests/test_curl_gpu.cpp
@@ -0,0 +1,264 @@
+/* This is GMSH/DG, a GPU-Accelerated Nodal Discontinuous Galerkin
+ * solver for Conservation Laws.
+ *
+ * Copyright (C) 2020-2022 Matteo Cicuttin - University of Liège
+ * 
+ * This code is released under GNU AGPLv3 license, see LICENSE.txt for details.
+ */
+
+#include <iostream>
+#include <iomanip>
+
+#include <unistd.h>
+
+#include "test.h"
+#include "sgr.hpp"
+#include "timecounter.h"
+#include "libgmshdg/gmsh_io.h"
+#include "libgmshdg/entity_data.h"
+#include "libgmshdg/kernels_cpu.h"
+#include "libgmshdg/silo_output.hpp"
+#include "libgmshdg/kernels_gpu.h"
+
+using namespace sgr;
+
+static void
+make_geometry(int order, double mesh_h)
+{
+    gm::add("difftest");
+
+    std::vector<std::pair<int,int>> objects;
+
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 0.5) )
+        );
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.5, 0.5, 0.5, 0.5) )
+        );
+
+    std::vector<std::pair<int, int>> tools;
+    gmsh::vectorpair odt;
+    std::vector<gmsh::vectorpair> odtm;
+    gmo::fragment(objects, tools, odt, odtm);
+
+    gmo::synchronize();
+
+    gvp_t vp;
+    gm::getEntities(vp);
+    gmm::setSize(vp, mesh_h);
+}
+
+int test_curl_convergence(int geometric_order, int approximation_order)
+{
+    std::vector<double> sizes({ 0.32, 0.16, 0.08, 0.04 });
+    std::vector<double> errors;
+
+    std::cout << std::endl << cyanfg << "Testing geometric order " << geometric_order;
+    std::cout << ", approximation order = " << approximation_order << nofg;
+    std::cout << std::endl;
+
+    auto fx = [](const point_3d& pt) -> double {
+        return std::sin(M_PI*pt.y())*std::sin(M_PI*pt.z());
+    };
+    auto fy = [](const point_3d& pt) -> double {
+      return std::sin(M_PI*pt.z())*std::sin(M_PI*pt.x());
+    };
+    auto fz = [](const point_3d& pt) -> double {
+      return std::sin(M_PI*pt.x())*std::sin(M_PI*pt.y());
+    };
+    
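+    /* cfx, cfy and cfz are the components of the analytical curl of F = (fx, fy, fz):
+     * curl F = (dFz/dy - dFy/dz, dFx/dz - dFz/dx, dFy/dx - dFx/dy). */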
+    auto cfx = [](const point_3d& pt) -> double {
+        return M_PI*std::sin(M_PI*pt.x())*std::cos(M_PI*pt.y()) - M_PI*std::cos(M_PI*pt.z())*std::sin(M_PI*pt.x());
+    };
+    auto cfy = [](const point_3d& pt) -> double {
+      return M_PI*std::sin(M_PI*pt.y())*std::cos(M_PI*pt.z()) - M_PI*std::cos(M_PI*pt.x())*std::sin(M_PI*pt.y());
+    };
+    auto cfz = [](const point_3d& pt) -> double {
+      return M_PI*std::sin(M_PI*pt.z())*std::cos(M_PI*pt.x()) - M_PI*std::cos(M_PI*pt.y())*std::sin(M_PI*pt.z());
+    };
+
+    std::vector<double> errs_x;
+    std::vector<double> errs_y;
+    std::vector<double> errs_z;
+
+    for (size_t i = 0; i < sizes.size(); i++)
+    {
+        double h = sizes[i];
+        make_geometry(0,h);
+        gmm::generate( DIMENSION(3) );
+
+        model mod(geometric_order, approximation_order);
+        mod.build();
+
+#ifdef WRITE_TEST_OUTPUTS
+        std::stringstream ss;
+        ss << "diff_go_" << geometric_order << "_ao_" << approximation_order;
+        ss << "_seq_" << i << ".silo";
+
+        silo silodb;
+        silodb.create_db(ss.str());
+        silodb.import_mesh_from_gmsh();
+        silodb.write_mesh();
+#endif
+        auto model_num_dofs = mod.num_dofs();
+
+        vecxd proj_fx = vecxd::Zero(model_num_dofs);
+        vecxd proj_fy = vecxd::Zero(model_num_dofs);
+        vecxd proj_fz = vecxd::Zero(model_num_dofs);
+        
+        vecxd proj_cfx = vecxd::Zero(model_num_dofs);
+        vecxd proj_cfy = vecxd::Zero(model_num_dofs);
+        vecxd proj_cfz = vecxd::Zero(model_num_dofs);
+
+        std::vector<entity_data_gpu> edgs;
+
+        size_t model_num_dofs_gpu = model_num_dofs;
+
+        for (auto& e : mod)
+        {
+
+            e.project(fx, proj_fx);
+            e.project(fy, proj_fy);
+            e.project(fz, proj_fz);
+            
+            e.project(cfx, proj_cfx);
+            e.project(cfy, proj_cfy);
+            e.project(cfz, proj_cfz);
+
+            entity_data_cpu ed;
+            e.populate_entity_data(ed, mod);
+            entity_data_gpu edg(ed);
+            edgs.push_back( std::move(edg) );
+        }
+
+         /* Prepare I/O vectors and call kernel */
+
+        device_vector<double> proj_fx_gpu(proj_fx.data(), proj_fx.size());
+        device_vector<double> proj_fy_gpu(proj_fy.data(), proj_fy.size());
+        device_vector<double> proj_fz_gpu(proj_fz.data(), proj_fz.size());
+
+        device_vector<double> comp_cfx_gpu(model_num_dofs_gpu);
+        device_vector<double> comp_cfy_gpu(model_num_dofs_gpu);
+        device_vector<double> comp_cfz_gpu(model_num_dofs_gpu);
+
+        for (auto& edg : edgs)
+        {
+            timecounter_gpu tc;
+            tc.tic();
+            gpu_compute_field_curl(edg, proj_fx_gpu.data(), proj_fy_gpu.data(), proj_fz_gpu.data(), 1.0,
+                comp_cfx_gpu.data(), comp_cfy_gpu.data(), comp_cfz_gpu.data());
+            double time = tc.toc();
+
+            auto num_cells = edg.num_all_elems;
+            if (geometric_order == 1)
+            {
+                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = (45*edg.num_bf+6)*edg.num_bf*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            }
+            else
+            {
+                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = ((45*edg.num_bf+12)*edg.num_bf /* *edg.num_qp */ + 6*((2*edg.num_bf-1) + 1)*edg.num_bf)*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            }
+        }
+
+        vecxd comp_cfx = vecxd::Zero(model_num_dofs_gpu);
+        vecxd comp_cfy = vecxd::Zero(model_num_dofs_gpu);
+        vecxd comp_cfz = vecxd::Zero(model_num_dofs_gpu);
+
+        comp_cfx_gpu.copyout( comp_cfx.data() );
+        comp_cfy_gpu.copyout( comp_cfy.data() );
+        comp_cfz_gpu.copyout( comp_cfz.data() );
+        
+        double err_x = 0.0;
+        double err_y = 0.0;
+        double err_z = 0.0;
+
+#ifdef WRITE_TEST_OUTPUTS
+        auto model_num_cells = mod.num_cells();
+        std::vector<double> var_cfx(model_num_cells);
+        std::vector<double> var_cfy(model_num_cells);
+        std::vector<double> var_cfz(model_num_cells);
+#endif
+
+        for (auto& e : mod)
+        {
+            for (size_t iT = 0; iT < e.num_cells(); iT++)
+            {
+                auto& pe = e.cell(iT);
+                auto& re = e.cell_refelem(pe);
+                matxd mass = e.mass_matrix(iT);
+                auto num_bf = re.num_basis_functions();
+                auto ofs = e.cell_model_dof_offset(iT);
+                vecxd diff_x = proj_cfx.segment(ofs, num_bf) - comp_cfx.segment(ofs, num_bf); 
+                vecxd diff_y = proj_cfy.segment(ofs, num_bf) - comp_cfy.segment(ofs, num_bf); 
+                vecxd diff_z = proj_cfz.segment(ofs, num_bf) - comp_cfz.segment(ofs, num_bf); 
+            
+                err_x += diff_x.dot(mass*diff_x);
+                err_y += diff_y.dot(mass*diff_y);
+                err_z += diff_z.dot(mass*diff_z);
+
+#ifdef WRITE_TEST_OUTPUTS
+                auto gi = e.cell_global_index_by_gmsh(iT);
+                assert(gi < model_num_cells);
+                vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+                var_cfx[gi] = comp_cfx.segment(ofs, num_bf).dot(phi_bar);
+                var_cfy[gi] = comp_cfy.segment(ofs, num_bf).dot(phi_bar);
+                var_cfz[gi] = comp_cfz.segment(ofs, num_bf).dot(phi_bar);
+#endif
+            }
+            std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
+            std::cout << " " << std::sqrt(err_z) << std::endl;
+        }
+
+#ifdef WRITE_TEST_OUTPUTS
+        silodb.write_zonal_variable("df_dx", var_cfz);
+        silodb.write_zonal_variable("df_dy", var_cfy);
+        silodb.write_zonal_variable("df_dz", var_cfz);
+#endif
+    
+        errs_x.push_back( std::sqrt(err_x) );
+        errs_y.push_back( std::sqrt(err_y) );
+        errs_z.push_back( std::sqrt(err_z) );
+
+        std::cout << std::endl;
+    }
+
+    double rate_x = 0.0;
+    double rate_y = 0.0;
+    double rate_z = 0.0;
+
+    std::cout << Byellowfg << "rate curl_x  rate curl_y  rate curl_z" << reset << std::endl;
+    for (size_t i = 1; i < sizes.size(); i++)
+    {
+        std::cout << (rate_x = std::log2(errs_x[i-1]/errs_x[i]) ) << " ";
+        std::cout << (rate_y = std::log2(errs_y[i-1]/errs_y[i]) ) << " ";
+        std::cout << (rate_z = std::log2(errs_z[i-1]/errs_z[i]) ) << std::endl;
+    }
+
+    COMPARE_VALUES_ABSOLUTE("df/dx", rate_x, double(approximation_order), 0.2);
+    COMPARE_VALUES_ABSOLUTE("df/dy", rate_y, double(approximation_order), 0.2);
+    COMPARE_VALUES_ABSOLUTE("df/dz", rate_z, double(approximation_order), 0.2);
+
+    return 0;
+}
+
+int main(void)
+{
+    gmsh::initialize();
+    gmsh::option::setNumber("General.Terminal", 0);
+    gmsh::option::setNumber("Mesh.Algorithm", 1);
+    gmsh::option::setNumber("Mesh.Algorithm3D", 1);
+
+
+    int failed_tests = 0;
+
+    std::cout << Bmagentafg << " *** TESTING: CURL ***" << reset << std::endl;
+    for (size_t go = 1; go < 2; go++)
+        for (size_t ao = 1; ao < 7; ao++)
+            failed_tests += test_curl_convergence(go, ao);
+
+    return failed_tests;
+}
diff --git a/tests/test_differentiation.cpp b/tests/test_differentiation.cpp
index 8bec165caef2cc3c717fbc4224204f3a4d68a5cc..a9080c713935a4c0d35717c59c83254a302e3f62 100644
--- a/tests/test_differentiation.cpp
+++ b/tests/test_differentiation.cpp
@@ -52,7 +52,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
     std::vector<double> sizes({ 0.32, 0.16, 0.08, 0.04 });
     std::vector<double> errors;
 
-    std::cout << cyanfg << "Testing geometric order " << geometric_order;
+    std::cout << std::endl << cyanfg << "Testing geometric order " << geometric_order;
     std::cout << ", approximation order = " << approximation_order << nofg;
     std::cout << std::endl;
 
@@ -125,13 +125,13 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
             auto num_cells = num_elems_all_orientations(ed);
             if (geometric_order == 1)
             {
-                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: ";
                 double flops = 21*ed.num_bf*ed.num_bf*num_cells;
                 std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
             }
             else
             {
-                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                std::cout << num_cells << " elements. Kernel runtime: " << time << " seconds. Estimated performance: ";
                 double flops = ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*num_cells;
                 std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
             }
@@ -188,6 +188,8 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         errs_x.push_back( std::sqrt(err_x) );
         errs_y.push_back( std::sqrt(err_y) );
         errs_z.push_back( std::sqrt(err_z) );
+
+        std::cout << std::endl;
     }
 
     double rate_x = 0.0;
diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp
index f36cca30f4deeaf52a39bee3ed027da8249180c6..ecfd5210a81e5e082ad2a6bff49c4ee674cb2da0 100644
--- a/tests/test_differentiation_gpu.cpp
+++ b/tests/test_differentiation_gpu.cpp
@@ -102,7 +102,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
 
         std::vector<entity_data_gpu> edgs;
         size_t model_num_dofs_gpu = model_num_dofs;
-
+        
         for (auto& e : mod)
         {
             e.project(f, Pf);