diff --git a/include/entity.h b/include/entity.h
index fc9219b97b32f734339db41f9ddabe8e1f3ca5c3..ebf3997f849c6670d7c8ff4911b72d725ee77bda 100644
--- a/include/entity.h
+++ b/include/entity.h
@@ -18,6 +18,10 @@ class entity
     int     g_order;
     int     a_order;
 
+    size_t                              m_dof_base;
+    size_t                              m_flux_base;
+    size_t                              m_index_base;
+
     entity_ordering                     cur_elem_ordering;
 
     std::vector<reference_element>      reference_cells;
@@ -56,6 +60,7 @@ public:
 
     double                      measure(void) const;
     vecxd                       project(const scalar_function&) const;
+    void                        project(const scalar_function&, vecxd&) const;
 
     size_t                      num_cells(void) const;
     size_t                      num_cell_orientations(void) const;
@@ -65,6 +70,11 @@ public:
     const reference_element&    cell_refelem(size_t) const;
     const reference_element&    cell_refelem(const physical_element&) const;
 
+    size_t                      cell_global_index(size_t) const;
+    size_t                      cell_global_index_by_gmsh(size_t) const;
+    size_t                      cell_local_dof_offset(size_t) const;
+    size_t                      cell_global_dof_offset(size_t) const;
+
     size_t                      num_faces(void) const;
 
     std::vector<size_t>         face_tags() const;
@@ -73,6 +83,14 @@ public:
     const reference_element&    face_refelem(size_t) const;
     const reference_element&    face_refelem(const physical_element&) const;
 
+    size_t                      num_dofs(void) const;
+    size_t                      num_fluxes(void) const;
+    void                        base(size_t, size_t, size_t);
+
+    size_t                      dof_base(void) const;
+    size_t                      flux_base(void) const;
+    size_t                      index_base(void) const;
+
     matxd                       mass_matrix(size_t) const;
     
     void                        sort_by_orientation(void);
@@ -84,7 +102,7 @@ public:
 
     void                        populate_entity_data(entity_data_cpu&) const;
 
-    constexpr size_t num_faces_per_elem(void) const { return 4; }
+    constexpr size_t            num_faces_per_elem(void) const { return 4; }
 
 
     std::vector<physical_element>::const_iterator  begin() const { return physical_cells.begin(); }
diff --git a/include/entity_data.h b/include/entity_data.h
index dd369949228ad52ec399e82028fb0aaafba4b191..6d96b3710ca93cc2cc1e68a76a84c29308e22b71 100644
--- a/include/entity_data.h
+++ b/include/entity_data.h
@@ -16,21 +16,21 @@ enum class entity_ordering {
 
 struct entity_data_cpu
 {
-    int                 g_order;
-    int                 a_order;
+    int                 g_order;            /* Geometric order */
+    int                 a_order;            /* Approximation order */
     std::vector<size_t> num_elems;          /* No. of elements in the entity */
     size_t              num_orientations;   /* No. of bf. orientations */
     size_t              num_bf;             /* No. of dofs per elem */
     size_t              num_qp;             /* No. of integration pts per elem */
-    size_t              num_fluxes;         /* No. of flux dofs per elem */
+    size_t              num_fluxes;         /* No. of fluxes per elem */
     size_t              num_face_qp;        /* No. of integration pts per face */
     size_t              num_faces;          /* No. of faces */
-    entity_ordering     ordering;
+    entity_ordering     ordering;           /* Entity element ordering */
+    size_t              dof_base;           /* Global start position for dofs */
+    size_t              flux_base;          /* Global start position for fluxes */
 
-    size_t              dof_base;           /* Offset in the global dof vector */
-    size_t              flux_base;          /* Offset in the global flux vector */
-    std::vector<size_t> fluxdofs_mine;      
-    std::vector<size_t> fluxdofs_neighbour;
+    std::vector<size_t> fluxdofs_mine;      /* Flux to DOF mapping, on myself */      
+    std::vector<size_t> fluxdofs_neighbour; /* Flux to DOF mapping, on neighbours */
 
     /* Inverse of mass matrices. Populated only if geometric order > 1.
      * num_bf*num_elems rows, num_bf columns. Matrix at offset elem*num_bf
@@ -76,6 +76,7 @@ struct entity_data_gpu
     size_t                      num_bf;
 
     size_t                      dof_base;
+    size_t                      flux_base;
 
 
     texture_allocator<double>   differentiation_matrices;
@@ -84,6 +85,7 @@ struct entity_data_gpu
 
 
     entity_data_gpu();
+    entity_data_gpu(entity_data_gpu&&) = default;
     entity_data_gpu(const entity_data_cpu&);
     ~entity_data_gpu();
 };
diff --git a/include/gmsh_io.h b/include/gmsh_io.h
index 190515764b364e82bae4e35a25d73e428d82af91..12d730b222624bec51a4ce607484b8037f8b2dd5 100644
--- a/include/gmsh_io.h
+++ b/include/gmsh_io.h
@@ -57,6 +57,10 @@ public:
         return boundary_map.at(which);
     }
 
+    size_t  num_dofs() const;
+    size_t  num_fluxes() const;
+    size_t  num_cells() const;
+
     std::vector<entity>::const_iterator begin() const;
     std::vector<entity>::const_iterator end() const;
     std::vector<entity>::iterator begin();
diff --git a/src/entity.cpp b/src/entity.cpp
index 361ebffbdd5d3bb99af6a675b0b26286ffcc4a3d..71f63439a6ffbfb5fa0fee9537211e6aea05d037 100644
--- a/src/entity.cpp
+++ b/src/entity.cpp
@@ -9,7 +9,8 @@
 
 entity::entity(const entity_params& ep)
     : dim(ep.dim), tag(ep.tag), elemType(ep.etype), g_order(ep.gorder),
-      a_order(ep.aorder), cur_elem_ordering(entity_ordering::GMSH)
+      a_order(ep.aorder), cur_elem_ordering(entity_ordering::GMSH),
+      m_dof_base(0), m_flux_base(0), m_index_base(0)
 {
     /* Prepare reference elements */
     reference_elements_factory ref(ep);
@@ -59,6 +60,8 @@ entity::measure(void) const
     return meas;
 }
 
+/* Project a function on this entity and return the
+ * corresponding vector of dofs on this entity. */
 vecxd
 entity::project(const scalar_function& function) const
 {
@@ -92,6 +95,37 @@ entity::project(const scalar_function& function) const
     return ret;
 }
 
+/* Project a function on this entity and store the partial result in
+ * the global vector pf at the offset corresponding to this entity. */
+void
+entity::project(const scalar_function& function, vecxd& pf) const
+{
+    auto num_bf = reference_cells[0].num_basis_functions();
+
+    for (size_t iT = 0; iT < physical_cells.size(); iT++)
+    {
+        const auto& pe = physical_cells[iT];
+        assert( pe.orientation() < reference_cells.size() );
+        const auto& re = reference_cells[pe.orientation()];
+        const auto pqps = pe.integration_points();
+        const auto dets = pe.determinants();
+
+        assert(pqps.size() == dets.size());
+
+        matxd mass = matxd::Zero(num_bf, num_bf);
+        vecxd rhs = vecxd::Zero(num_bf);
+        for (size_t iQp = 0; iQp < pqps.size(); iQp++)
+        {
+            const auto& pqp = pqps[iQp];
+            const vecxd phi = re.basis_functions(iQp);
+            mass += pqp.weight() * phi * phi.transpose();
+            rhs += pqp.weight() * function(pqp.point()) * phi; 
+        }
+
+        pf.segment(m_dof_base + iT*num_bf, num_bf) = mass.ldlt().solve(rhs);
+    }
+}
+
 const reference_element&
 entity::cell_refelem(size_t iO) const
 {
@@ -106,6 +140,35 @@ entity::cell_refelem(const physical_element& pe) const
     return reference_cells[pe.orientation()];
 }
 
+size_t
+entity::cell_global_index(size_t iT) const
+{
+    assert(iT < physical_cells.size());
+    return m_index_base + iT;
+}
+
+size_t
+entity::cell_global_index_by_gmsh(size_t iT) const
+{
+    assert(iT < physical_cells.size());
+    return m_index_base + physical_cells[iT].original_position();
+}
+
+
+size_t
+entity::cell_local_dof_offset(size_t iT) const
+{
+    assert(reference_cells.size() > 0);
+    return iT * reference_cells[0].num_basis_functions();
+}
+
+size_t
+entity::cell_global_dof_offset(size_t iT) const
+{
+    assert(reference_cells.size() > 0);
+    return m_dof_base + iT * reference_cells[0].num_basis_functions();
+}
+
 const reference_element&
 entity::face_refelem(const physical_element& pe) const
 {
@@ -158,6 +221,45 @@ entity::face(size_t iF) const
     return physical_faces[iF];
 }
 
+size_t
+entity::num_dofs(void) const
+{
+    assert(reference_cells.size() > 0);
+    return physical_cells.size() * reference_cells[0].num_basis_functions();
+}
+
+size_t
+entity::num_fluxes(void) const
+{
+    assert(reference_faces.size() > 0);
+    return physical_faces.size() * reference_faces[0].num_basis_functions();
+}
+
+void
+entity::base(size_t d_base, size_t f_base, size_t i_base)
+{
+    m_dof_base = d_base;
+    m_flux_base = f_base;
+    m_index_base = i_base;
+}
+
+size_t
+entity::dof_base(void) const
+{
+    return m_dof_base;
+}
+
+size_t
+entity::flux_base(void) const
+{
+    return m_flux_base;
+}
+
+size_t
+entity::index_base(void) const
+{
+    return m_index_base;
+}
 
 matxd
 entity::mass_matrix(size_t iT) const
@@ -200,8 +302,10 @@ entity::sort_by_orientation(void)
         return false;
     };
 
+    /* Sort cells */
     std::sort(physical_cells.begin(), physical_cells.end(), comp);
 
+    /* Sort all the stuff related to faces */
     std::vector<physical_element> new_pf;
     new_pf.resize( physical_faces.size() );
     std::vector<size_t> new_ft;
@@ -578,8 +682,8 @@ entity::populate_entity_data(entity_data_cpu& ed) const
     ed.num_fluxes = reference_faces[0].num_basis_functions();
     ed.num_qp = (g_order == 1) ? 1 : reference_cells[0].num_integration_points();
     ed.num_face_qp = (g_order == 1) ? 1 : reference_faces[0].num_integration_points();
-    ed.dof_base = 0;
-    ed.flux_base = 0;
+    ed.dof_base = m_dof_base;
+    ed.flux_base = m_flux_base;
 
     populate_differentiation_matrices(ed);
     populate_jacobians(ed);
diff --git a/src/entity_data.cpp b/src/entity_data.cpp
index 5fef224fccfe036ef2d5c45aa05687e7a1f6b72e..d89975c296dffd7d2d0a5afa371e30eaf1737cee 100644
--- a/src/entity_data.cpp
+++ b/src/entity_data.cpp
@@ -22,8 +22,8 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
     element_orientation.copyin(eo.data(), eo.size());
 
     num_bf = ed.num_bf;
-
-    dof_base = 0;
+    dof_base = ed.dof_base;
+    flux_base = ed.flux_base;
     
     auto rows = ed.num_bf*ed.num_orientations;
     auto cols = ed.num_bf*3;
diff --git a/src/gmsh_io.cpp b/src/gmsh_io.cpp
index 6b13afd9f291dbd5e229b8086311e7b564ae7a3d..7009681dce42cb49332b9db161adfa02e29dfee3 100644
--- a/src/gmsh_io.cpp
+++ b/src/gmsh_io.cpp
@@ -96,7 +96,7 @@ model::begin() const
 std::vector<entity>::const_iterator
 model::end() const
 {
-    return entities.begin();
+    return entities.end();
 }
 
 std::vector<entity>::iterator
@@ -108,7 +108,7 @@ model::begin()
 std::vector<entity>::iterator
 model::end()
 {
-    return entities.begin();
+    return entities.end();
 }
 
 entity&
@@ -117,6 +117,36 @@ model::entity_at(size_t pos)
     return entities.at(pos);
 }
 
+size_t
+model::num_dofs(void) const
+{
+    size_t ret = 0;
+    for (const auto& e : entities)
+        ret += e.num_dofs();
+
+    return ret;
+}
+
+size_t
+model::num_fluxes(void) const
+{
+    size_t ret = 0;
+    for (const auto& e : entities)
+        ret += e.num_fluxes();
+
+    return ret;
+}
+
+size_t
+model::num_cells(void) const
+{
+    size_t ret = 0;
+    for (const auto& e : entities)
+        ret += e.num_cells();
+
+    return ret;
+}
+
 void
 model::update_connectivity(const entity& e, size_t entity_index)
 {
@@ -139,6 +169,9 @@ model::import_gmsh_entities(void)
     gm::getEntities(gmsh_entities, DIMENSION(3));
 
     size_t entity_index = 0;
+    size_t dof_base = 0;
+    size_t flux_base = 0;
+    size_t index_base = 0;
     for (auto [dim, tag] : gmsh_entities)
     {
         std::vector<int> elemTypes;
@@ -154,6 +187,11 @@ model::import_gmsh_entities(void)
                       .gorder = geometric_order, .aorder = approximation_order});
             
             e.sort_by_orientation();
+            e.base(dof_base, flux_base, index_base);
+            dof_base += e.num_dofs();
+            flux_base += e.num_fluxes();
+            index_base += e.num_cells();
+
             update_connectivity(e, entity_index);
 
             entities.push_back( std::move(e) );
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index bd95928d3e04bb892db370070550d86944470341..528311bd1466320a96c08eb224f4f1a11a2c8f83 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -6,18 +6,18 @@ __global__ void
 gpu_deriv_planar(const double *F, const double * __restrict__ J,
     gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
     double * __restrict__ dF_dy, double * __restrict__ dF_dz,
-    int32_t num_all_elems, int32_t* orients)
+    int32_t num_all_elems, int32_t* orients, int32_t dof_base)
 {
     using KS = kernel_gpu_sizes<K>;
 
     int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
-
-    int32_t cur_dof_offset = /*edg.dof_base + */ofs_in_entity;
-    if (cur_dof_offset >= num_all_elems*KS::num_bf)
+    if (ofs_in_entity >= num_all_elems*KS::num_bf)
         return;
 
-    int32_t iT = cur_dof_offset / KS::num_bf;
-    int32_t elem_dof_base = /*edg.dof_base + */iT*KS::num_bf;
+    int32_t cur_dof_offset = dof_base + ofs_in_entity;
+
+    int32_t iT = ofs_in_entity / KS::num_bf;
+    int32_t elem_dof_base = dof_base + iT*KS::num_bf;
     int32_t iO = orients[iT];
     int32_t DM_row = ofs_in_entity % KS::num_bf;
     int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
@@ -71,7 +71,7 @@ launch_deriv_kernel(entity_data_gpu& edg,
 
     if (edg.g_order == 1)
         gpu_deriv_planar<K><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
-            Dtex, df_dx, df_dy, df_dz, num_elems, orients);
+            Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
     //else
     //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
 }
diff --git a/tests/test_differentiation.cpp b/tests/test_differentiation.cpp
index 9ffc813b127b0feef2576bdcec5d26925f5e3010..d8b17e041b6fe696f95c0c26f2739a3c83a90a26 100644
--- a/tests/test_differentiation.cpp
+++ b/tests/test_differentiation.cpp
@@ -9,6 +9,7 @@
 #include "entity_data.h"
 #include "kernels_cpu.h"
 #include "timecounter.h"
+#include "silo_output.hpp"
 
 using namespace sgr;
 
@@ -16,7 +17,21 @@ static void
 make_geometry(int order, double mesh_h)
 {
     gm::add("difftest");
-    gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 1.0);
+
+    std::vector<std::pair<int,int>> objects;
+
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 0.5) )
+        );
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.5, 0.5, 0.5, 0.5) )
+        );
+
+    std::vector<std::pair<int, int>> tools;
+    gmsh::vectorpair odt;
+    std::vector<gmsh::vectorpair> odtm;
+    gmo::fragment(objects, tools, odt, odtm);
+
     gmo::synchronize();
 
     gvp_t vp;
@@ -56,58 +71,108 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         make_geometry(0,h);
         model mod(geometric_order, approximation_order);
 
-        auto& e0 = mod.entity_at(0);
-        e0.sort_by_orientation();
-        vecxd Pf_e0 = e0.project(f);
-        vecxd df_dx_e0 = vecxd::Zero( Pf_e0.rows() );
-        vecxd df_dy_e0 = vecxd::Zero( Pf_e0.rows() );
-        vecxd df_dz_e0 = vecxd::Zero( Pf_e0.rows() );
-
-        entity_data_cpu ed;
-        e0.populate_entity_data(ed);
-
-        timecounter tc;
-        tc.tic();
-        compute_field_derivatives(ed, Pf_e0, df_dx_e0, df_dy_e0, df_dz_e0);
-        double time = tc.toc();
-
-        if (geometric_order == 1)
+        std::stringstream ss;
+        ss << "diff_go_" << geometric_order << "_ao_" << approximation_order;
+        ss << "_seq_" << i << ".silo";
+
+#ifdef WRITE_TEST_OUTPUTS
+        silo silodb;
+        silodb.create_db(ss.str());
+        silodb.import_mesh_from_gmsh();
+        silodb.write_mesh();
+#endif
+        auto model_num_dofs = mod.num_dofs();
+
+        vecxd Pf        = vecxd::Zero(model_num_dofs);
+        vecxd Pdf_dx    = vecxd::Zero(model_num_dofs);
+        vecxd Pdf_dy    = vecxd::Zero(model_num_dofs);
+        vecxd Pdf_dz    = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dx    = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dy    = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dz    = vecxd::Zero(model_num_dofs);
+
+        std::vector<entity_data_cpu> eds;
+
+        for (auto& e : mod)
         {
-            std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
-            double flops = 21*ed.num_bf*ed.num_bf*e0.num_cells();
-            std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            e.project(f, Pf);
+            e.project(df_dx, Pdf_dx);
+            e.project(df_dy, Pdf_dy);
+            e.project(df_dz, Pdf_dz);
+            entity_data_cpu ed;
+            e.populate_entity_data(ed);
+            eds.push_back( std::move(ed) );
         }
-        else
+
+        for (auto& ed : eds)
         {
-            std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
-            double flops = ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*e0.num_cells();
-            std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            timecounter tc;
+            tc.tic();
+            compute_field_derivatives(ed, Pf, Cdf_dx, Cdf_dy, Cdf_dz);
+            double time = tc.toc();
+
+            auto num_cells = num_elems_all_orientations(ed);
+            if (geometric_order == 1)
+            {
+                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = 21*ed.num_bf*ed.num_bf*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            }
+            else
+            {
+                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            }
         }
-
-        vecxd Pdf_dx_e0 = e0.project(df_dx);
-        vecxd Pdf_dy_e0 = e0.project(df_dy);
-        vecxd Pdf_dz_e0 = e0.project(df_dz);
-
+        
         double err_x = 0.0;
         double err_y = 0.0;
         double err_z = 0.0;
 
-        for (size_t iT = 0; iT < e0.num_cells(); iT++)
+#ifdef WRITE_TEST_OUTPUTS
+        auto model_num_cells = mod.num_cells();
+        std::vector<double> var_Pf(model_num_cells);
+        std::vector<double> var_df_dx(model_num_cells);
+        std::vector<double> var_df_dy(model_num_cells);
+        std::vector<double> var_df_dz(model_num_cells);
+#endif
+
+        for (auto& e : mod)
         {
-            auto& pe = e0.cell(iT);
-            auto& re = e0.cell_refelem(pe);
-            matxd mass = e0.mass_matrix(iT);
-            auto num_bf = re.num_basis_functions();
-            vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_e0.segment(iT*num_bf, num_bf); 
-            vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_e0.segment(iT*num_bf, num_bf); 
-            vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_e0.segment(iT*num_bf, num_bf); 
-        
-            err_x += diff_x.dot(mass*diff_x);
-            err_y += diff_y.dot(mass*diff_y);
-            err_z += diff_z.dot(mass*diff_z);
+            for (size_t iT = 0; iT < e.num_cells(); iT++)
+            {
+                auto& pe = e.cell(iT);
+                auto& re = e.cell_refelem(pe);
+                matxd mass = e.mass_matrix(iT);
+                auto num_bf = re.num_basis_functions();
+                auto ofs = e.cell_global_dof_offset(iT);
+                vecxd diff_x = Pdf_dx.segment(ofs, num_bf) - Cdf_dx.segment(ofs, num_bf); 
+                vecxd diff_y = Pdf_dy.segment(ofs, num_bf) - Cdf_dy.segment(ofs, num_bf); 
+                vecxd diff_z = Pdf_dz.segment(ofs, num_bf) - Cdf_dz.segment(ofs, num_bf); 
+            
+                err_x += diff_x.dot(mass*diff_x);
+                err_y += diff_y.dot(mass*diff_y);
+                err_z += diff_z.dot(mass*diff_z);
+
+#ifdef WRITE_TEST_OUTPUTS
+                auto gi = e.cell_global_index_by_gmsh(iT);
+                assert(gi < model_num_cells);
+                vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+                var_df_dx[gi] = Cdf_dx.segment(ofs, num_bf).dot(phi_bar);
+                var_df_dy[gi] = Cdf_dy.segment(ofs, num_bf).dot(phi_bar);
+                var_df_dz[gi] = Cdf_dz.segment(ofs, num_bf).dot(phi_bar);
+#endif
+            }
+            std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
+            std::cout << " " << std::sqrt(err_z) << std::endl;
         }
-        std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
-        std::cout << " " << std::sqrt(err_z) << std::endl;
+
+#ifdef WRITE_TEST_OUTPUTS
+        silodb.write_zonal_variable("df_dx", var_df_dx);
+        silodb.write_zonal_variable("df_dy", var_df_dy);
+        silodb.write_zonal_variable("df_dz", var_df_dz);
+#endif
     
         errs_x.push_back( std::sqrt(err_x) );
         errs_y.push_back( std::sqrt(err_y) );
diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp
index 46dbe740d28cf60022d71c484a629d10b22de299..4f77055272077170bd8f0e2c75374b195d29c73c 100644
--- a/tests/test_differentiation_gpu.cpp
+++ b/tests/test_differentiation_gpu.cpp
@@ -9,6 +9,7 @@
 #include "entity_data.h"
 #include "kernels_cpu.h"
 #include "timecounter.h"
+#include "silo_output.hpp"
 
 #include "kernels_gpu.h"
 
@@ -18,7 +19,21 @@ static void
 make_geometry(int order, double mesh_h)
 {
     gm::add("difftest");
-    gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 1.0);
+
+    std::vector<std::pair<int,int>> objects;
+
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 0.5) )
+        );
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.5, 0.5, 0.5, 0.5) )
+        );
+
+    std::vector<std::pair<int, int>> tools;
+    gmsh::vectorpair odt;
+    std::vector<gmsh::vectorpair> odtm;
+    gmo::fragment(objects, tools, odt, odtm);
+
     gmo::synchronize();
 
     gvp_t vp;
@@ -26,6 +41,8 @@ make_geometry(int order, double mesh_h)
     gmm::setSize(vp, mesh_h);
 }
 
+#define WRITE_TEST_OUTPUTS
+
 int test_differentiation_convergence(int geometric_order, int approximation_order)
 {
     std::vector<double> sizes({ 0.32, 0.16, 0.08, 0.04 });
@@ -58,73 +75,120 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         make_geometry(0,h);
         model mod(geometric_order, approximation_order);
 
-        auto& e0 = mod.entity_at(0);
-        e0.sort_by_orientation();
-        vecxd Pf_cpu = e0.project(f);
-
-        /* Make CPU entity data */
-        entity_data_cpu ed;
-        e0.populate_entity_data(ed);
-        /* Derive GPU entity data */
-        entity_data_gpu edg(ed);
-
-        /* Prepare I/O vectors and call kernel */
-        device_vector<double> Pf_gpu(Pf_cpu.data(), Pf_cpu.size());
-        device_vector<double> df_dx_gpu(Pf_cpu.size());
-        device_vector<double> df_dy_gpu(Pf_cpu.size());
-        device_vector<double> df_dz_gpu(Pf_cpu.size());
-
-        timecounter_gpu tc;
-        tc.tic();
-        gpu_compute_field_derivatives(edg, Pf_gpu.data(), df_dx_gpu.data(),
-            df_dy_gpu.data(), df_dz_gpu.data());
-        double time = tc.toc();
-
-        vecxd df_dx_cpu = vecxd::Zero( Pf_cpu.size() );
-        vecxd df_dy_cpu = vecxd::Zero( Pf_cpu.size() );
-        vecxd df_dz_cpu = vecxd::Zero( Pf_cpu.size() );
-
-        df_dx_gpu.copyout( df_dx_cpu.data() );
-        df_dy_gpu.copyout( df_dy_cpu.data() );
-        df_dz_gpu.copyout( df_dz_cpu.data() );
-
-        if (geometric_order == 1)
+        std::stringstream ss;
+        ss << "diff_go_" << geometric_order << "_ao_" << approximation_order;
+        ss << "_seq_" << i << ".silo";
+
+#ifdef WRITE_TEST_OUTPUTS
+        silo silodb;
+        silodb.create_db(ss.str());
+        silodb.import_mesh_from_gmsh();
+        silodb.write_mesh();
+#endif
+        auto model_num_dofs = mod.num_dofs();
+
+        vecxd Pf        = vecxd::Zero(model_num_dofs);
+        vecxd Pdf_dx    = vecxd::Zero(model_num_dofs);
+        vecxd Pdf_dy    = vecxd::Zero(model_num_dofs);
+        vecxd Pdf_dz    = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dx    = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dy    = vecxd::Zero(model_num_dofs);
+        vecxd Cdf_dz    = vecxd::Zero(model_num_dofs);
+
+        std::vector<entity_data_gpu> edgs;
+
+        for (auto& e : mod)
         {
-            std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
-            double flops = 21*ed.num_bf*ed.num_bf*e0.num_cells();
-            std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            e.project(f, Pf);
+            e.project(df_dx, Pdf_dx);
+            e.project(df_dy, Pdf_dy);
+            e.project(df_dz, Pdf_dz);
+            entity_data_cpu ed;
+            e.populate_entity_data(ed);
+            entity_data_gpu edg(ed);
+            edgs.push_back( std::move(edg) );
         }
-        else
+
+         /* Prepare I/O vectors and call kernel */
+        device_vector<double> Pf_gpu(Pf.data(), Pf.size());
+        device_vector<double> df_dx_gpu(model_num_dofs);
+        device_vector<double> df_dy_gpu(model_num_dofs);
+        device_vector<double> df_dz_gpu(model_num_dofs);
+
+        for (auto& edg : edgs)
         {
-            std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
-            double flops = ((21*ed.num_bf+6)*ed.num_bf*ed.num_qp + 3*(2*ed.num_bf-1)*ed.num_bf)*e0.num_cells();
-            std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            timecounter_gpu tc;
+            tc.tic();
+            gpu_compute_field_derivatives(edg, Pf_gpu.data(), df_dx_gpu.data(),
+                df_dy_gpu.data(), df_dz_gpu.data());
+            double time = tc.toc();
+
+            auto num_cells = edg.num_all_elems;
+            if (geometric_order == 1)
+            {
+                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = 21*edg.num_bf*edg.num_bf*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            }
+            else
+            {
+                std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
+                double flops = ((21*edg.num_bf+6)*edg.num_bf/**edg.num_qp*/ + 3*(2*edg.num_bf-1)*edg.num_bf)*num_cells;
+                std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
+            }
         }
 
-        vecxd Pdf_dx_e0 = e0.project(df_dx);
-        vecxd Pdf_dy_e0 = e0.project(df_dy);
-        vecxd Pdf_dz_e0 = e0.project(df_dz);
-
+        df_dx_gpu.copyout( Cdf_dx.data() );
+        df_dy_gpu.copyout( Cdf_dy.data() );
+        df_dz_gpu.copyout( Cdf_dz.data() );
+        
         double err_x = 0.0;
         double err_y = 0.0;
         double err_z = 0.0;
 
-        for (size_t iT = 0; iT < e0.num_cells(); iT++)
+#ifdef WRITE_TEST_OUTPUTS
+        auto model_num_cells = mod.num_cells();
+        std::vector<double> var_Pf(model_num_cells);
+        std::vector<double> var_df_dx(model_num_cells);
+        std::vector<double> var_df_dy(model_num_cells);
+        std::vector<double> var_df_dz(model_num_cells);
+#endif
+
+        for (auto& e : mod)
         {
-            auto& pe = e0.cell(iT);
-            auto& re = e0.cell_refelem(pe);
-            matxd mass = e0.mass_matrix(iT);
-            auto num_bf = re.num_basis_functions();
-            vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_cpu.segment(iT*num_bf, num_bf); 
-            vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_cpu.segment(iT*num_bf, num_bf); 
-            vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_cpu.segment(iT*num_bf, num_bf); 
-        
-            err_x += diff_x.dot(mass*diff_x);
-            err_y += diff_y.dot(mass*diff_y);
-            err_z += diff_z.dot(mass*diff_z);
+            for (size_t iT = 0; iT < e.num_cells(); iT++)
+            {
+                auto& pe = e.cell(iT);
+                auto& re = e.cell_refelem(pe);
+                matxd mass = e.mass_matrix(iT);
+                auto num_bf = re.num_basis_functions();
+                auto ofs = e.cell_global_dof_offset(iT);
+                vecxd diff_x = Pdf_dx.segment(ofs, num_bf) - Cdf_dx.segment(ofs, num_bf); 
+                vecxd diff_y = Pdf_dy.segment(ofs, num_bf) - Cdf_dy.segment(ofs, num_bf); 
+                vecxd diff_z = Pdf_dz.segment(ofs, num_bf) - Cdf_dz.segment(ofs, num_bf); 
+            
+                err_x += diff_x.dot(mass*diff_x);
+                err_y += diff_y.dot(mass*diff_y);
+                err_z += diff_z.dot(mass*diff_z);
+
+#ifdef WRITE_TEST_OUTPUTS
+                auto gi = e.cell_global_index_by_gmsh(iT);
+                assert(gi < model_num_cells);
+                vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+                var_df_dx[gi] = Cdf_dx.segment(ofs, num_bf).dot(phi_bar);
+                var_df_dy[gi] = Cdf_dy.segment(ofs, num_bf).dot(phi_bar);
+                var_df_dz[gi] = Cdf_dz.segment(ofs, num_bf).dot(phi_bar);
+#endif
+            }
+            std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
+            std::cout << " " << std::sqrt(err_z) << std::endl;
         }
-        std::cout << "Errors: " << std::sqrt(err_x) << " " << std::sqrt(err_y);
-        std::cout << " " << std::sqrt(err_z) << std::endl;
+
+#ifdef WRITE_TEST_OUTPUTS
+        silodb.write_zonal_variable("df_dx", var_df_dx);
+        silodb.write_zonal_variable("df_dy", var_df_dy);
+        silodb.write_zonal_variable("df_dz", var_df_dz);
+#endif
     
         errs_x.push_back( std::sqrt(err_x) );
         errs_y.push_back( std::sqrt(err_y) );
@@ -161,8 +225,8 @@ int main(void)
     int failed_tests = 0;
 
     std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
-    for (size_t go = 1; go < 2; go++)
-        for (size_t ao = go; ao < 7; ao++)
+    for (size_t go = 1; go < 5; go++)
+        for (size_t ao = go; ao < 5; ao++)
             failed_tests += test_differentiation_convergence(go, ao);
 
     return failed_tests;