diff --git a/CMakeLists.txt b/CMakeLists.txt
index e1ece6ee8f84ea181a7dd770c7abd5d61eb98a3e..5e0068e5d4189093c358bf34c9987706d5815f1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,7 +147,7 @@ endif ()
 if (COMPILER_IS_CLANG)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Weverything -Wno-c++98-compat -Wno-c++98-compat-pedantic -Wno-zero-as-null-pointer-constant -Wno-global-constructors -Wno-padded -Wno-shorten-64-to-32 -Wno-sign-conversion -Wno-old-style-cast -Wno-sign-compare -Wno-c99-extensions -Wno-extra-semi-stmt")
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wno-sign-compare")
 endif()
 set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g -fpermissive")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG -fpermissive")
@@ -243,6 +243,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
 set(COMMON_SOURCES gmsh_io.cpp reference_element.cpp physical_element.cpp entity.cpp kernels_cpu.cpp connectivity.cpp entity_data.cpp)
 
 set(LIBGMSHDG_SOURCES src/gmsh_io.cpp
+                      src/silo_io.cpp
                       src/entity.cpp
                       src/reference_element.cpp
                       src/physical_element.cpp
@@ -251,11 +252,15 @@ set(LIBGMSHDG_SOURCES src/gmsh_io.cpp
                       src/kernels_cpu.cpp
                       src/kernels_gpu_mi.cpp
                       src/kernels_cuda.cu
+                      src/maxwell_interface.cpp
                     )
 
 add_library(gmshdg SHARED ${LIBGMSHDG_SOURCES})
 target_link_libraries(gmshdg ${LINK_LIBS})
 
+add_executable(maxwell_solver src/maxwell_solver.cpp)
+target_link_libraries(maxwell_solver gmshdg)
+
 add_executable(test_basics tests/test_basics.cpp)
 target_link_libraries(test_basics gmshdg)
 
diff --git a/include/entity.h b/include/entity.h
index ebf3997f849c6670d7c8ff4911b72d725ee77bda..46fa446a725b9819f6af2c2f70bd2772c161828e 100644
--- a/include/entity.h
+++ b/include/entity.h
@@ -8,6 +8,7 @@
 #include "entity_data.h"
 
 using scalar_function = std::function<double(const point_3d&)>;
+using vector_function = std::function<vec3d(const point_3d&)>;
 
 class entity
 {
@@ -59,8 +60,12 @@ public:
     /* add method to adjust offset in system!! */
 
     double                      measure(void) const;
+    
     vecxd                       project(const scalar_function&) const;
+    //matxd                       project(const vector_function&) const;
+
     void                        project(const scalar_function&, vecxd&) const;
+    void                        project(const vector_function&, vecxd&, vecxd&, vecxd&) const;
 
     size_t                      num_cells(void) const;
     size_t                      num_cell_orientations(void) const;
diff --git a/include/gmsh_io.h b/include/gmsh_io.h
index 12d730b222624bec51a4ce607484b8037f8b2dd5..60b0194ecdcd280e9a1bf595261412d5f371817e 100644
--- a/include/gmsh_io.h
+++ b/include/gmsh_io.h
@@ -60,14 +60,15 @@ public:
     size_t  num_dofs() const;
     size_t  num_fluxes() const;
     size_t  num_cells() const;
+    size_t  num_entities() const;
 
     std::vector<entity>::const_iterator begin() const;
     std::vector<entity>::const_iterator end() const;
     std::vector<entity>::iterator begin();
     std::vector<entity>::iterator end();
 
-    entity&     entity_at(size_t);
-    entity      entity_at(size_t) const;
+    entity&         entity_at(size_t);
+    const entity&   entity_at(size_t) const;
 
 };
 
diff --git a/include/gpu_support.hpp b/include/gpu_support.hpp
index 052243d3dec8382d010ae9c0378a2eae187ebc29..658c7a3315fd389297956ee2698742a246ef597b 100644
--- a/include/gpu_support.hpp
+++ b/include/gpu_support.hpp
@@ -148,82 +148,109 @@ fetch_tex(cudaTextureObject_t tex, int32_t ofs)
 template<typename T>
 class device_vector
 {
-    T       *d_vptr;
-    size_t  vsize;
+    T *     m_devptr;
+    size_t  m_size;
 
 public:
     device_vector()
-        : d_vptr(nullptr), vsize(0)
+        : m_devptr(nullptr), m_size(0)
     {}
 
     device_vector(size_t size)
-        : d_vptr(nullptr), vsize(0)
+        : m_devptr(nullptr), m_size(0)
     {
-        (void)checkGPU( gpuMalloc((void**)&d_vptr, size*sizeof(T)) );
-        (void)checkGPU( gpuMemset(d_vptr, 0, size*sizeof(T)) );
-        vsize = size;
+        (void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
+        (void)checkGPU( gpuMemset(m_devptr, 0, size*sizeof(T)) );
+        m_size = size;
     }
 
-    /* allocate memory and copyin data at ptr */
     device_vector(const T *src, size_t size)
-        : d_vptr(nullptr), vsize(0)
+        : m_devptr(nullptr), m_size(0)
     {
         copyin(src, size);
     }
 
+    device_vector(const device_vector& other)
+    {
+        (void)checkGPU( gpuMalloc((void**)&m_devptr, other.m_size*sizeof(T)) );
+        (void)checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
+        m_size = other.m_size;
+    }
+
+    device_vector(device_vector&& other)
+    {
+        m_devptr = other.m_devptr;
+        m_size = other.m_size;
+        other.m_devptr = nullptr;
+        other.m_size = 0;
+    }
+
     void copyin(const T *src, size_t size)
     {
-        if (d_vptr != nullptr)
-            (void) gpuFree(d_vptr);
+        if (m_devptr != nullptr)
+            (void) gpuFree(m_devptr);
 
-        (void)checkGPU( gpuMalloc((void**)&d_vptr, size*sizeof(T)) );
-        (void)checkGPU( gpuMemcpy(d_vptr, src, size*sizeof(T), gpuMemcpyHostToDevice) );
-        vsize = size;
+        (void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
+        (void)checkGPU( gpuMemcpy(m_devptr, src, size*sizeof(T), gpuMemcpyHostToDevice) );
+        m_size = size;
     }
 
     void copyout(T *dst)
     {
-        if (d_vptr == nullptr)
+        if (m_devptr == nullptr)
             return;
 
-        (void)checkGPU( gpuMemcpy(dst, d_vptr, vsize*sizeof(T), gpuMemcpyDeviceToHost) );
+        (void)checkGPU( gpuMemcpy(dst, m_devptr, m_size*sizeof(T), gpuMemcpyDeviceToHost) );
     }
 
     void resize(size_t size)
     {
-        if (d_vptr != nullptr)
-            (void) gpuFree(d_vptr);
+        if (size == m_size)
+            return;
 
-        (void)checkGPU( gpuMalloc((void**)&d_vptr, size*sizeof(T)) );
-        vsize = size;
-    }
+        if (m_devptr != nullptr)
+            (void) gpuFree(m_devptr);
 
-    device_vector(const device_vector&) = delete;
+        (void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
+        m_size = size;
+    }
 
-    device_vector(device_vector&& other)
+    device_vector& operator=(const device_vector& other)
     {
-        d_vptr = other.d_vptr;
-        vsize  = other.vsize;
-        other.d_vptr = nullptr;
-        other.vsize = 0;
+        if ( (m_devptr != nullptr) or (m_size != other.m_size) )
+        {
+            (void) gpuFree(m_devptr);
+            (void) checkGPU( gpuMalloc((void**)&m_devptr, other.m_size*sizeof(T)) );
+        }
+
+        (void) checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
+        m_size = other.m_size;
     }
 
-    device_vector& operator=(const device_vector&) = delete;
+    void zero(void)
+    {
+        (void) checkGPU( gpuMemset(m_devptr, 0, m_size*sizeof(T)) );
+    }
 
     T* data()
     {
-        return d_vptr;
+        return m_devptr;
+    }
+
+    const T* data() const
+    {
+        return m_devptr;
     }
 
     size_t size() const
     {
-        return vsize;
+        return m_size;
     }
 
     ~device_vector()
     {
-        if (d_vptr)
-            (void)checkGPU(gpuFree(d_vptr));
+        if (m_devptr)
+            (void)checkGPU(gpuFree(m_devptr));
     }
 };
 
diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index d8df2bfb2d16977367fe4b57b0f1b8da83ba0735..1c81c6d51719278d8f16cea65646973cb69b006f 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -143,8 +143,9 @@ void reshape_dofs(const entity_data_cpu&, const entity_data_gpu&, const vecxd&,
 
 
 void
-gpu_compute_field_derivatives(entity_data_gpu& edg,
-    gpuTextureObject_t F, double *dF_dx, double* dF_dy, double* dF_dz);
+gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F,
+    double *dF_dx, double* dF_dy, double* dF_dz, double alpha);
 
-void
-gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out);
\ No newline at end of file
+void gpu_compute_flux_lifting(entity_data_gpu&, const double *, double *);
+
+void gpu_subtract(double *, const double *, const double *, size_t);
\ No newline at end of file
diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h
new file mode 100644
index 0000000000000000000000000000000000000000..2481d886eec061ed99bb2d9e921520fd30497b4e
--- /dev/null
+++ b/include/maxwell_interface.h
@@ -0,0 +1,89 @@
+#pragma once
+
+#include "types.h"
+#include "gpu_support.hpp"
+
+struct electromagnetic_field
+{
+    vecxd   Ex;
+    vecxd   Ey;
+    vecxd   Ez;
+    vecxd   Hx;
+    vecxd   Hy;
+    vecxd   Hz;
+
+    size_t  num_dofs;
+
+    void    resize(size_t);
+};
+
+struct electromagnetic_material_params
+{
+    size_t  num_dofs;
+    size_t  num_fluxes;
+
+    vecxd   inv_epsilon;
+    vecxd   inv_mu;
+    vecxd   sigma;
+    vecxd   sigma_over_epsilon;
+
+    /* Lax-Milgram flux coefficients */
+    vecxd   aE;
+    vecxd   bE;
+    vecxd   aH;
+    vecxd   bH;
+
+    /* Boundary conditions coefficients */
+    vecxd   bc_coeffs;
+};
+
+struct electromagnetic_material_params_gpu
+{
+    size_t  num_dofs;
+    size_t  num_fluxes;
+
+    device_vector<double>   inv_epsilon;
+    device_vector<double>   inv_mu;
+    device_vector<double>   sigma;
+    device_vector<double>   sigma_over_epsilon;
+
+    /* Lax-Milgram flux coefficients */
+    device_vector<double>   aE;
+    device_vector<double>   bE;
+    device_vector<double>   aH;
+    device_vector<double>   bH;
+
+    /* Boundary conditions coefficients */
+    device_vector<double>   bc_coeffs;
+};
+
+struct electromagnetic_field_gpu
+{
+    device_vector<double>   Ex;
+    device_vector<double>   Ey;
+    device_vector<double>   Ez;
+    device_vector<double>   Hx;
+    device_vector<double>   Hy;
+    device_vector<double>   Hz;
+
+    size_t num_dofs;
+
+    struct raw_ptrs
+    {
+        size_t  num_dofs;
+
+        double  *Ex;
+        double  *Ey;
+        double  *Ez;
+        double  *Hx;
+        double  *Hy;
+        double  *Hz;
+    };
+
+    void        zero(void);
+    void        resize(size_t);
+    raw_ptrs    data(void);
+    void        copyin(const electromagnetic_field&);
+    void        copyout(electromagnetic_field&);
+
+};
\ No newline at end of file
diff --git a/include/silo_output.hpp b/include/silo_output.hpp
index 52fac0cb7db4497ab51a09240b53a3eed0d9f139..092f443b8fae5d26f879e497b115e503a2c7dd4e 100644
--- a/include/silo_output.hpp
+++ b/include/silo_output.hpp
@@ -1,10 +1,7 @@
 #pragma once
 
-#include <cassert>
+#include <vector>
 #include <silo.h>
-#include "gmsh.h"
-
-#define INVALID_OFS ((size_t) (~0))
 
 class silo
 {
@@ -16,171 +13,21 @@ class silo
     std::vector<int>        nodelist;
     int                     num_cells;
 
-    DBfile                  *m_siloDb;
-
-    void gmsh_get_nodes()
-    {
-        std::vector<size_t>     nodeTags;
-        std::vector<double>     coords;
-        std::vector<double>     paraCoords;
-
-        gmsh::model::mesh::getNodes(nodeTags, coords, paraCoords, -1, -1, true, false);
-
-        auto maxtag_pos = std::max_element(nodeTags.begin(), nodeTags.end());
-        
-        size_t maxtag = 0;
-        if (maxtag_pos != nodeTags.end())
-            maxtag = (*maxtag_pos)+1;
-
-        node_coords_x.resize( nodeTags.size() );
-        node_coords_y.resize( nodeTags.size() );
-        node_coords_z.resize( nodeTags.size() );
-
-        for (size_t i = 0; i < nodeTags.size(); i++)
-        {
-            node_coords_x[i] = coords[3*i+0];
-            node_coords_y[i] = coords[3*i+1];
-            node_coords_z[i] = coords[3*i+2];
-        }
-
-        node_tag2ofs.resize(maxtag, INVALID_OFS);
-        for (size_t i = 0; i < nodeTags.size(); i++)
-            node_tag2ofs.at( nodeTags[i] ) = i;
-    }
-
-    void gmsh_get_elements()
-    {
-        gmsh::vectorpair entities;
-        num_cells = 0;
-        gmsh::model::getEntities(entities, 3);
-        for (auto [dim, tag] : entities)
-        {
-            std::vector<int> elemTypes;
-            gmsh::model::mesh::getElementTypes(elemTypes, dim, tag);
-            for (auto& elemType : elemTypes)
-            {
-                std::vector<size_t> elemTags;
-                std::vector<size_t> elemNodeTags;
-                gmsh::model::mesh::getElementsByType(elemType, elemTags, elemNodeTags, tag);
-                auto nodesPerElem = elemNodeTags.size()/elemTags.size();
-                assert( elemTags.size() * nodesPerElem == elemNodeTags.size() );
-            
-                for (size_t i = 0; i < elemTags.size(); i++)
-                {
-                    auto base = nodesPerElem * i;
+    DBfile *                m_siloDb;
 
-                    auto node0_tag = elemNodeTags[base + 0];
-                    assert(node0_tag < node_tag2ofs.size());
-                    auto node0_ofs = node_tag2ofs[node0_tag];
-                    assert(node0_ofs != INVALID_OFS);
-                    nodelist.push_back(node0_ofs + 1);
-
-                    auto node1_tag = elemNodeTags[base + 1];
-                    assert(node1_tag < node_tag2ofs.size());
-                    auto node1_ofs = node_tag2ofs[node1_tag];
-                    assert(node1_ofs != INVALID_OFS);
-                    nodelist.push_back(node1_ofs + 1);
-
-                    auto node2_tag = elemNodeTags[base + 2];
-                    assert(node2_tag < node_tag2ofs.size());
-                    auto node2_ofs = node_tag2ofs[node2_tag];
-                    assert(node2_ofs != INVALID_OFS);
-                    nodelist.push_back(node2_ofs + 1);
-
-                    auto node3_tag = elemNodeTags[base + 3];
-                    assert(node3_tag < node_tag2ofs.size());
-                    auto node3_ofs = node_tag2ofs[node3_tag];
-                    assert(node3_ofs != INVALID_OFS);
-                    nodelist.push_back(node3_ofs + 1);
-                }
-
-                num_cells += elemTags.size();
-            }
-        }
-
-        assert(nodelist.size() == 4*num_cells);
-    }
+    void    gmsh_get_nodes(void);
+    void    gmsh_get_elements(void);
 
 public:
-    silo()
-    {}
-
-    void import_mesh_from_gmsh()
-    {
-        gmsh_get_nodes();
-        gmsh_get_elements();        
-    }
-
-    bool create_db(const std::string& dbname)
-    {
-        m_siloDb = DBCreate(dbname.c_str(), DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5);
-        if (m_siloDb)
-            return true;
-
-        std::cout << "SILO: Error creating database" << std::endl;
-        return false;
-    }
-
-    bool write_mesh(double time = -1.0, int cycle = 0)
-    {
-        DBoptlist   *optlist = nullptr;
-
-        if (time >= 0)
-        {
-            optlist = DBMakeOptlist(2);
-            DBAddOption(optlist, DBOPT_CYCLE, &cycle);
-            DBAddOption(optlist, DBOPT_DTIME, &time);
-        }
-
-        double *coords[] = {node_coords_x.data(), node_coords_y.data(), node_coords_z.data()};
-
-        int lnodelist = nodelist.size();
-
-        int shapetype[] = { DB_ZONETYPE_TET };
-        int shapesize[] = {4};
-        int shapecounts[] = { num_cells };
-        int nshapetypes = 1;
-        int nnodes = node_coords_x.size();
-        int nzones = num_cells;
-        int ndims = 3;
-
-        DBPutZonelist2(m_siloDb, "zonelist", nzones, ndims,
-            nodelist.data(), lnodelist, 1, 0, 0, shapetype, shapesize,
-            shapecounts, nshapetypes, NULL);
-
-        DBPutUcdmesh(m_siloDb, "mesh", ndims, NULL, coords, nnodes, nzones,
-            "zonelist", NULL, DB_DOUBLE, optlist);
-
-        if (optlist)
-            DBFreeOptlist(optlist);
-
-        return true;
-    }
-
-    bool write_zonal_variable(const std::string& name, const std::vector<double>& data)
-    {
-        if (data.size() != num_cells)
-        {
-            std::cout << "Invalid number of cells" << std::endl;
-            return false;
-        }
-
-        DBPutUcdvar1(m_siloDb, name.c_str(), "mesh", data.data(), data.size(), NULL,
-                     0, DB_DOUBLE, DB_ZONECENT, NULL);
-        
-        return true;
-    }
-
-    void close_db()
-    {
-        if (m_siloDb)
-            DBClose(m_siloDb);
-
-        m_siloDb = nullptr;
-    }
-
-    ~silo()
-    {
-        close_db();
-    }
+    silo();
+    ~silo();
+
+    void    import_mesh_from_gmsh();
+    bool    create_db(const std::string&);
+    bool    write_mesh(double time = -1.0, int cycle = 0);
+    bool    write_zonal_variable(const std::string&, const std::vector<double>&);
+    bool    write_zonal_variable(const std::string&, const std::vector<float>&);
+    bool    write_scalar_expression(const std::string& name, const std::string& expression);
+    bool    write_vector_expression(const std::string& name, const std::string& expression);
+    void    close_db();
 };
diff --git a/src/entity.cpp b/src/entity.cpp
index d6801d4006661d498773a8fcf4a763611f6eb895..aa68961024c99036c7317db5785cc65b9a17a632 100644
--- a/src/entity.cpp
+++ b/src/entity.cpp
@@ -126,6 +126,45 @@ entity::project(const scalar_function& function, vecxd& pf) const
     }
 }
 
+/* Project a function on this entity and store the partial result in
+ * the global vector pf at the offset corresponding to this entity. */
+void
+entity::project(const vector_function& function, vecxd& pfx, vecxd& pfy, vecxd& pfz) const
+{
+    auto num_bf = reference_cells[0].num_basis_functions();
+
+    for (size_t iT = 0; iT < physical_cells.size(); iT++)
+    {
+        const auto& pe = physical_cells[iT];
+        assert( pe.orientation() < reference_cells.size() );
+        const auto& re = reference_cells[pe.orientation()];
+        const auto pqps = pe.integration_points();
+        const auto dets = pe.determinants();
+
+        assert(pqps.size() == dets.size());
+
+        matxd mass = matxd::Zero(num_bf, num_bf);
+        vecxd rhs_x = vecxd::Zero(num_bf);
+        vecxd rhs_y = vecxd::Zero(num_bf);
+        vecxd rhs_z = vecxd::Zero(num_bf);
+        for (size_t iQp = 0; iQp < pqps.size(); iQp++)
+        {
+            const auto& pqp = pqps[iQp];
+            const vecxd phi = re.basis_functions(iQp);
+            mass += pqp.weight() * phi * phi.transpose();
+            vec3d fval = function(pqp.point());
+            rhs_x += pqp.weight() * fval(0) * phi; 
+            rhs_y += pqp.weight() * fval(1) * phi; 
+            rhs_z += pqp.weight() * fval(2) * phi; 
+        }
+
+        Eigen::LDLT<matxd> mass_ldlt(mass);
+        pfx.segment(m_dof_base + iT*num_bf, num_bf) = mass_ldlt.solve(rhs_x);
+        pfy.segment(m_dof_base + iT*num_bf, num_bf) = mass_ldlt.solve(rhs_y);
+        pfz.segment(m_dof_base + iT*num_bf, num_bf) = mass_ldlt.solve(rhs_z);
+    }
+}
+
 const reference_element&
 entity::cell_refelem(size_t iO) const
 {
diff --git a/src/gmsh_io.cpp b/src/gmsh_io.cpp
index 7009681dce42cb49332b9db161adfa02e29dfee3..e6022c2e17bbaa1b23b7edc02942c08f1dedb18c 100644
--- a/src/gmsh_io.cpp
+++ b/src/gmsh_io.cpp
@@ -117,6 +117,12 @@ model::entity_at(size_t pos)
     return entities.at(pos);
 }
 
+const entity&
+model::entity_at(size_t pos) const
+{
+    return entities.at(pos);
+}
+
 size_t
 model::num_dofs(void) const
 {
@@ -147,6 +153,12 @@ model::num_cells(void) const
     return ret;
 }
 
+size_t
+model::num_entities(void) const
+{
+    return entities.size();
+}
+
 void
 model::update_connectivity(const entity& e, size_t entity_index)
 {
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 667d480c5b764d2ae0c24da934457e9a2ffe34cc..da11465929dad3538f57be27107f8072254f8461 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -1,56 +1,59 @@
 #include "entity_data.h"
 #include "kernels_gpu.h"
 
+/* compute α∇F */
 template<size_t K>
 __global__ void
-gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J,
+gpu_deriv_planar(const double *F, const double * __restrict__ J,
     gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
     double * __restrict__ dF_dy, double * __restrict__ dF_dz,
-    int32_t num_all_elems, int32_t* orients, int32_t dof_base)
+    double alpha, int32_t num_all_elems, const int32_t* orients,
+    int32_t dof_base)
 {
     using KS = kernel_gpu_sizes<K>;
 
-    int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
+    const int32_t ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
     if (ofs_in_entity >= num_all_elems*KS::num_bf)
         return;
 
-    int32_t output_dof_offset = dof_base + ofs_in_entity;
+    const int32_t output_dof_offset = dof_base + ofs_in_entity;
 
-    int32_t iT = ofs_in_entity / KS::num_bf;
-    int32_t elem_dof_base = dof_base + iT*KS::num_bf;
-    int32_t iO = orients[iT];
-    int32_t DM_row = ofs_in_entity % KS::num_bf;
-    int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+    const int32_t iT = ofs_in_entity / KS::num_bf;
+    const int32_t elem_dof_base = dof_base + iT*KS::num_bf;
+    const int32_t iO = orients[iT];
+    const int32_t DM_row = ofs_in_entity % KS::num_bf;
+    const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+    const int32_t DM_base = DM_orient + DM_row;
+    const int32_t jac_ofs = 9*iT;
 
     double accm_dF_dx = 0.0;
     double accm_dF_dy = 0.0;
     double accm_dF_dz = 0.0;
-    int32_t jac_ofs = 9*iT;
     for (int32_t dof = 0; dof < KS::num_bf; dof++)
     {
-        int32_t d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof;
+        int32_t d_ofs = DM_base + 3*KS::num_bf*dof;
         int32_t f_ofs = elem_dof_base + dof;
-        double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        double v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];//fetch_tex(F, f_ofs);
         accm_dF_dx += J[jac_ofs+0] * v;
         accm_dF_dy += J[jac_ofs+3] * v;
         accm_dF_dz += J[jac_ofs+6] * v;
     
-        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + KS::num_bf;
-        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        d_ofs = DM_base + 3*KS::num_bf*dof + KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];//fetch_tex(F, f_ofs);
         accm_dF_dx += J[jac_ofs+1] * v;
         accm_dF_dy += J[jac_ofs+4] * v;
         accm_dF_dz += J[jac_ofs+7] * v;
     
-        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + 2*KS::num_bf;
-        v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
+        d_ofs = DM_base + 3*KS::num_bf*dof + 2*KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];//fetch_tex(F, f_ofs);
         accm_dF_dx += J[jac_ofs+2] * v;
         accm_dF_dy += J[jac_ofs+5] * v;
         accm_dF_dz += J[jac_ofs+8] * v;
     }
 
-    dF_dx[output_dof_offset] = accm_dF_dx;
-    dF_dy[output_dof_offset] = accm_dF_dy;
-    dF_dz[output_dof_offset] = accm_dF_dz;
+    dF_dx[output_dof_offset] = alpha*accm_dF_dx;
+    dF_dy[output_dof_offset] = alpha*accm_dF_dy;
+    dF_dz[output_dof_offset] = alpha*accm_dF_dz;
 }
 
 template<size_t K>
@@ -64,43 +67,44 @@ gpu_deriv_planar_blocked(gpuTextureObject_t F, const double * __restrict__ J,
 
     if (threadIdx.x >= KS::dofs_per_dblock)
         return;
-
-    int32_t elem_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::cells_per_dblock;
-    int32_t elem_ofs = threadIdx.x / KS::num_bf;
-    int32_t iT = elem_base + elem_ofs;
+    const int32_t y_idx = (blockIdx.y * blockDim.y + threadIdx.y);
+    const int32_t elem_base = y_idx * KS::cells_per_dblock;
+    const int32_t elem_ofs = threadIdx.x / KS::num_bf;
+    const int32_t iT = elem_base + elem_ofs;
 
     if (iT >= num_all_elems)
         return;
     
-    int32_t block_dof_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::dblock_size;
+    const int32_t block_dof_base = y_idx * KS::dblock_size;
 
-    int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x;
+    const int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x;
 
-    int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf;
-    int32_t iO = orients[iT];
-    int32_t DM_row = threadIdx.x % KS::num_bf;
-    int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+    const int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf;
+    const int32_t iO = orients[iT];
+    const int32_t DM_row = threadIdx.x % KS::num_bf;
+    const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
+    const int32_t DM_base = DM_orient + DM_row;
+    const int32_t jac_ofs = 9*iT;
 
     double accm_dF_dx = 0.0;
     double accm_dF_dy = 0.0;
     double accm_dF_dz = 0.0;
-    int32_t jac_ofs = 9*iT;
     for (int32_t dof = 0; dof < KS::num_bf; dof++)
     {
-        int32_t d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof;
+        int32_t d_ofs = DM_base + 3*KS::num_bf*dof;
         int32_t f_ofs = elem_dof_base + dof;
         double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
         accm_dF_dx += J[jac_ofs+0] * v;
         accm_dF_dy += J[jac_ofs+3] * v;
         accm_dF_dz += J[jac_ofs+6] * v;
     
-        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + KS::num_bf;
+        d_ofs = DM_base + 3*KS::num_bf*dof + KS::num_bf;
         v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
         accm_dF_dx += J[jac_ofs+1] * v;
         accm_dF_dy += J[jac_ofs+4] * v;
         accm_dF_dz += J[jac_ofs+7] * v;
     
-        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + 2*KS::num_bf;
+        d_ofs = DM_base + 3*KS::num_bf*dof + 2*KS::num_bf;
         v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
         accm_dF_dx += J[jac_ofs+2] * v;
         accm_dF_dy += J[jac_ofs+5] * v;
@@ -114,8 +118,8 @@ gpu_deriv_planar_blocked(gpuTextureObject_t F, const double * __restrict__ J,
 
 template<size_t K>
 void
-launch_deriv_kernel(entity_data_gpu& edg,
-    gpuTextureObject_t f, double *df_dx, double* df_dy, double* df_dz)
+launch_deriv_kernel(const entity_data_gpu& edg,
+    const double *f, double *df_dx, double* df_dy, double* df_dz, double alpha)
 {
     auto J = edg.jacobians.data();
     auto Dtex = edg.differentiation_matrices.get_texture();
@@ -144,42 +148,42 @@ launch_deriv_kernel(entity_data_gpu& edg,
 
     if (edg.g_order == 1)
         gpu_deriv_planar<K><<<num_blocks, KS::deriv_threads>>>(f, J,
-            Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
+            Dtex, df_dx, df_dy, df_dz, alpha, num_elems, orients, edg.dof_base);
     //else
     //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
 #endif
 }
 
 void
-gpu_compute_field_derivatives(entity_data_gpu& edg,
-    gpuTextureObject_t f, double *df_dx, double* df_dy, double* df_dz)
+gpu_compute_field_derivatives(const entity_data_gpu& edg,
+    const double *f, double *df_dx, double* df_dy, double* df_dz, double alpha)
 {
     
 
     switch (edg.a_order)
     {
         case 1:
-            launch_deriv_kernel<1>(edg, f, df_dx, df_dy, df_dz);
+            launch_deriv_kernel<1>(edg, f, df_dx, df_dy, df_dz, alpha);
             break;
 
         case 2:
-            launch_deriv_kernel<2>(edg, f, df_dx, df_dy, df_dz);
+            launch_deriv_kernel<2>(edg, f, df_dx, df_dy, df_dz, alpha);
             break;
     
         case 3:
-            launch_deriv_kernel<3>(edg, f, df_dx, df_dy, df_dz);
+            launch_deriv_kernel<3>(edg, f, df_dx, df_dy, df_dz, alpha);
             break;
 
         case 4:
-            launch_deriv_kernel<4>(edg, f, df_dx, df_dy, df_dz);
+            launch_deriv_kernel<4>(edg, f, df_dx, df_dy, df_dz, alpha);
             break;
       
         case 5:
-            launch_deriv_kernel<5>(edg, f, df_dx, df_dy, df_dz);
+            launch_deriv_kernel<5>(edg, f, df_dx, df_dy, df_dz, alpha);
             break;
 
         case 6:
-            launch_deriv_kernel<6>(edg, f, df_dx, df_dy, df_dz);
+            launch_deriv_kernel<6>(edg, f, df_dx, df_dy, df_dz, alpha);
             break;
 
         default:
@@ -193,6 +197,12 @@ gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
     const double * __restrict__ dets, double * __restrict__ lifted_flux,
     int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base)
 {
+    /* This kernel saturates the texture cache bandwidth (~1 TB/s on K20x!!)
+     * and thus it does not achieve good performance. This happens because
+     * the lifting matrix entries are not reused sufficiently. Sufficient
+     * reuse should be achieved by using a single entry to multiply different
+     * rhs instead of only one at a time. Orientations however make the
+     * implementation complicated, so for now the kernel remains slow. */
     using KS = kernel_gpu_sizes<K>;
 
     /* One thread per *output* dof */
@@ -277,3 +287,23 @@ gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out)
             throw std::invalid_argument("compute_field_derivatives: invalid order");
     }
 }
+
+void __global__
+gpu_subtract_kernel(double * __restrict__ dst, const double * __restrict__ add,
+    const double * __restrict__ sub, size_t num_dofs)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+        dst[dof] = add[dof] - sub[dof];
+}
+
+void
+gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs)
+{
+    static const size_t SUBTRACT_THREADS = 256;
+    auto gs = num_dofs/SUBTRACT_THREADS;
+    if (num_dofs % SUBTRACT_THREADS != 0)
+        gs += 1;
+    
+    gpu_subtract_kernel<<<gs, SUBTRACT_THREADS>>>(dst, add, sub, num_dofs);
+}
\ No newline at end of file
diff --git a/src/maxwell_interface.cpp b/src/maxwell_interface.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..76b732ca6389bff7943f8edbbf8a4f1e95c115f0
--- /dev/null
+++ b/src/maxwell_interface.cpp
@@ -0,0 +1,77 @@
+#include "maxwell_interface.h"
+
+void
+electromagnetic_field::resize(size_t p_num_dofs)
+{
+    Ex = vecxd::Zero(p_num_dofs);
+    Ey = vecxd::Zero(p_num_dofs);
+    Ez = vecxd::Zero(p_num_dofs);
+    Hx = vecxd::Zero(p_num_dofs);
+    Hy = vecxd::Zero(p_num_dofs);
+    Hz = vecxd::Zero(p_num_dofs);
+    num_dofs = p_num_dofs;
+}
+
+void
+electromagnetic_field_gpu::zero()
+{
+    Ex.zero();
+    Ey.zero();
+    Ez.zero();
+    Hx.zero();
+    Hy.zero();
+    Hz.zero();
+}
+
+void
+electromagnetic_field_gpu::resize(size_t p_num_dofs)
+{
+    Ex.resize(p_num_dofs);
+    Ey.resize(p_num_dofs);
+    Ez.resize(p_num_dofs);
+    Hx.resize(p_num_dofs);
+    Hy.resize(p_num_dofs);
+    Hz.resize(p_num_dofs);
+    num_dofs = p_num_dofs;
+}
+
+electromagnetic_field_gpu::raw_ptrs
+electromagnetic_field_gpu::data(void)
+{
+    raw_ptrs ret;
+
+    ret.num_dofs = num_dofs;
+    ret.Ex = Ex.data();
+    ret.Ey = Ey.data();
+    ret.Ez = Ez.data();
+    ret.Hx = Hx.data();
+    ret.Hy = Hy.data();
+    ret.Hz = Hz.data();
+
+    return ret;
+}
+
+void
+electromagnetic_field_gpu::copyin(const electromagnetic_field& emf)
+{
+    Ex.copyin(emf.Ex.data(), emf.Ex.size());
+    Ey.copyin(emf.Ey.data(), emf.Ey.size());
+    Ez.copyin(emf.Ez.data(), emf.Ez.size());
+    Hx.copyin(emf.Hx.data(), emf.Hx.size());
+    Hy.copyin(emf.Hy.data(), emf.Hy.size());
+    Hz.copyin(emf.Hz.data(), emf.Hz.size());
+}
+
+void
+electromagnetic_field_gpu::copyout(electromagnetic_field& emf)
+{
+    if (num_dofs != emf.num_dofs)
+        emf.resize(num_dofs);
+
+    Ex.copyout(emf.Ex.data());
+    Ey.copyout(emf.Ey.data());
+    Ez.copyout(emf.Ez.data());
+    Hx.copyout(emf.Hx.data());
+    Hy.copyout(emf.Hy.data());
+    Hz.copyout(emf.Hz.data());
+}
\ No newline at end of file
diff --git a/src/maxwell_solver.cpp b/src/maxwell_solver.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ca3b15b78c7cc7012948a9a623131214b0ca3350
--- /dev/null
+++ b/src/maxwell_solver.cpp
@@ -0,0 +1,317 @@
+#include <iostream>
+
+#include "gmsh_io.h"
+#include "maxwell_interface.h"
+#include "kernels_cpu.h"
+#include "kernels_gpu.h"
+#include "silo_output.hpp"
+
+static void
+make_geometry(int order, double mesh_h)
+{
+    gm::add("difftest");
+
+    std::vector<std::pair<int,int>> objects;
+
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.0, 1.0, 1.0, 0.5) )
+        );
+    objects.push_back(
+        std::make_pair(3, gmo::addBox(0.0, 0.0, 0.5, 0.5, 0.5, 0.5) )
+        );
+
+    std::vector<std::pair<int, int>> tools;
+    gmsh::vectorpair odt;
+    std::vector<gmsh::vectorpair> odtm;
+    gmo::fragment(objects, tools, odt, odtm);
+
+    gmo::synchronize();
+
+    gvp_t vp;
+    gm::getEntities(vp);
+    gmm::setSize(vp, mesh_h);
+}
+
+struct solver_state
+{
+    std::vector<entity_data_cpu>        eds;
+    electromagnetic_field               emf_curr;
+    electromagnetic_field               emf_next;
+    electromagnetic_material_params     matparams;
+
+    double                              delta_t;
+    double                              curr_time;
+    size_t                              curr_timestep;
+
+    electromagnetic_field               dx;
+    electromagnetic_field               dy;
+    electromagnetic_field               dz;
+};
+
+struct solver_state_gpu
+{
+    std::vector<entity_data_cpu>        eds;
+    std::vector<entity_data_gpu>        edgs;
+    electromagnetic_field_gpu           emf_curr;
+    electromagnetic_field_gpu           emf_next;
+    electromagnetic_material_params     matparams;
+
+    double                              delta_t;
+    double                              curr_time;
+    size_t                              curr_timestep;
+
+    electromagnetic_field_gpu           dx;
+    electromagnetic_field_gpu           dy;
+    electromagnetic_field_gpu           dz;
+};
+
+void init_from_model(const model& mod, solver_state& state)
+{
+    state.emf_curr.resize( mod.num_dofs() );
+    state.emf_next.resize( mod.num_dofs() );
+    state.dx.resize( mod.num_dofs() );
+    state.dy.resize( mod.num_dofs() );
+    state.dz.resize( mod.num_dofs() );
+
+    auto E = [](const point_3d& pt) -> vec3d {
+        vec3d ret;
+        ret(0) = std::sin(M_PI * pt.y());
+        ret(1) = 0;
+        ret(2) = 0;
+        return ret;
+    };
+
+    for (auto& e : mod)
+    {
+        entity_data_cpu ed;
+        e.populate_entity_data(ed);
+        state.eds.push_back( std::move(ed) );
+        e.project(E, state.emf_curr.Ex, state.emf_curr.Ey, state.emf_curr.Ez);
+    }
+}
+
+void init_from_model(const model& mod, solver_state_gpu& state)
+{
+    state.emf_curr.resize( mod.num_dofs() );
+    state.emf_next.resize( mod.num_dofs() );
+    state.dx.resize( mod.num_dofs() );
+    state.dy.resize( mod.num_dofs() );
+    state.dz.resize( mod.num_dofs() );
+
+    electromagnetic_field emf_io;
+    emf_io.resize( mod.num_dofs() );
+
+    auto E = [](const point_3d& pt) -> vec3d {
+        vec3d ret;
+        ret(0) = std::sin(M_PI * pt.y());
+        ret(1) = 0;
+        ret(2) = 0;
+        return ret;
+    };
+
+
+    for (auto& e : mod)
+    {
+        entity_data_cpu ed;
+        e.populate_entity_data(ed);
+        e.project(E, emf_io.Ex, emf_io.Ey, emf_io.Ez);
+        entity_data_gpu edg(ed);
+        state.eds.push_back( std::move(ed) );
+        state.edgs.push_back( std::move(edg) );
+    }
+
+    state.emf_curr.copyin(emf_io);
+}
+
+void compute_curls(solver_state& state, const electromagnetic_field& curr,
+    electromagnetic_field& next)
+{
+    for (const auto& ed : state.eds)
+    {
+        compute_field_derivatives(ed, curr.Ex, state.dx.Ex, state.dy.Ex, state.dz.Ex);
+        compute_field_derivatives(ed, curr.Ey, state.dx.Ey, state.dy.Ey, state.dz.Ey);
+        compute_field_derivatives(ed, curr.Ez, state.dx.Ez, state.dy.Ez, state.dz.Ez);
+        compute_field_derivatives(ed, curr.Hx, state.dx.Hx, state.dy.Hx, state.dz.Hx);
+        compute_field_derivatives(ed, curr.Hy, state.dx.Hy, state.dy.Hy, state.dz.Hy);
+        compute_field_derivatives(ed, curr.Hz, state.dx.Hz, state.dy.Hz, state.dz.Hz);
+    }
+
+    next.Ex = state.dy.Hz - state.dz.Hy;
+    next.Ey = state.dz.Hx - state.dx.Hz;
+    next.Ez = state.dx.Hy - state.dy.Hx;
+    next.Hx = state.dz.Ey - state.dy.Ez;
+    next.Hy = state.dx.Ez - state.dz.Ex;
+    next.Hz = state.dy.Ex - state.dx.Ey;
+}
+
+void compute_curls(solver_state_gpu& state, const electromagnetic_field_gpu& curr,
+    electromagnetic_field_gpu& next)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+    auto dxp = state.dx.data();
+    auto dyp = state.dy.data();
+    auto dzp = state.dz.data();
+
+    for (const auto& edg : state.edgs)
+    {
+        gpu_compute_field_derivatives(edg, currp.Ex, dxp.Ex, dyp.Ex, dzp.Ex, 1.0);
+        gpu_compute_field_derivatives(edg, currp.Ey, dxp.Ey, dyp.Ey, dzp.Ey, 1.0);
+        gpu_compute_field_derivatives(edg, currp.Ez, dxp.Ez, dyp.Ez, dzp.Ez, 1.0);
+        gpu_compute_field_derivatives(edg, currp.Hx, dxp.Hx, dyp.Hx, dzp.Hx, 1.0);
+        gpu_compute_field_derivatives(edg, currp.Hy, dxp.Hy, dyp.Hy, dzp.Hy, 1.0);
+        gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0);
+    }
+
+    
+    gpu_subtract(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs);
+    gpu_subtract(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs);
+    gpu_subtract(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs);
+    gpu_subtract(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs);
+    gpu_subtract(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs);
+    gpu_subtract(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs);
+}
+
+void apply_operator(solver_state& state, const electromagnetic_field& curr,
+    electromagnetic_field& next)
+{
+    compute_curls(state, curr, next);
+}
+
+void apply_operator(solver_state_gpu& state, const electromagnetic_field_gpu& curr,
+    electromagnetic_field_gpu& next)
+{
+    compute_curls(state, curr, next);
+}
+
+void
+export_to_silo(const model& mod, const solver_state& state, const std::string& filename)
+{
+    silo sdb;
+    sdb.create_db(filename);
+    sdb.import_mesh_from_gmsh();
+    sdb.write_mesh();
+
+    std::vector<double> curr_Ex; curr_Ex.resize( mod.num_cells() );
+    std::vector<double> curr_Ey; curr_Ey.resize( mod.num_cells() );
+    std::vector<double> curr_Ez; curr_Ez.resize( mod.num_cells() );
+    std::vector<double> curr_Hx; curr_Hx.resize( mod.num_cells() );
+    std::vector<double> curr_Hy; curr_Hy.resize( mod.num_cells() );
+    std::vector<double> curr_Hz; curr_Hz.resize( mod.num_cells() );
+
+    std::vector<double> next_Ex; next_Ex.resize( mod.num_cells() );
+    std::vector<double> next_Ey; next_Ey.resize( mod.num_cells() );
+    std::vector<double> next_Ez; next_Ez.resize( mod.num_cells() );
+    std::vector<double> next_Hx; next_Hx.resize( mod.num_cells() );
+    std::vector<double> next_Hy; next_Hy.resize( mod.num_cells() );
+    std::vector<double> next_Hz; next_Hz.resize( mod.num_cells() );
+
+    for (size_t iE = 0; iE < mod.num_entities(); iE++)
+    {
+        auto& e = mod.entity_at(iE);
+        for (size_t iT = 0; iT < e.num_cells(); iT++)
+        {
+            auto& pe = e.cell(iT);
+            auto& re = e.cell_refelem(pe);
+            vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+            auto num_bf = re.num_basis_functions();
+            auto ofs = e.cell_global_dof_offset(iT);
+            auto gi = e.cell_global_index_by_gmsh(iT);
+            curr_Ex[gi] = state.emf_curr.Ex.segment(ofs, num_bf).dot(phi_bar);
+            curr_Ey[gi] = state.emf_curr.Ey.segment(ofs, num_bf).dot(phi_bar);
+            curr_Ez[gi] = state.emf_curr.Ez.segment(ofs, num_bf).dot(phi_bar);
+            curr_Hx[gi] = state.emf_curr.Hx.segment(ofs, num_bf).dot(phi_bar);
+            curr_Hy[gi] = state.emf_curr.Hy.segment(ofs, num_bf).dot(phi_bar);
+            curr_Hz[gi] = state.emf_curr.Hz.segment(ofs, num_bf).dot(phi_bar);
+            next_Ex[gi] = state.emf_next.Ex.segment(ofs, num_bf).dot(phi_bar);
+            next_Ey[gi] = state.emf_next.Ey.segment(ofs, num_bf).dot(phi_bar);
+            next_Ez[gi] = state.emf_next.Ez.segment(ofs, num_bf).dot(phi_bar);
+            next_Hx[gi] = state.emf_next.Hx.segment(ofs, num_bf).dot(phi_bar);
+            next_Hy[gi] = state.emf_next.Hy.segment(ofs, num_bf).dot(phi_bar);
+            next_Hz[gi] = state.emf_next.Hz.segment(ofs, num_bf).dot(phi_bar);
+        }
+    }
+
+    sdb.write_zonal_variable("curr_Ex", curr_Ex);
+    sdb.write_zonal_variable("curr_Ey", curr_Ey);
+    sdb.write_zonal_variable("curr_Ez", curr_Ez);
+    sdb.write_zonal_variable("curr_Hx", curr_Hx);
+    sdb.write_zonal_variable("curr_Hy", curr_Hy);
+    sdb.write_zonal_variable("curr_Hz", curr_Hz);
+    //sdb.write_vector_expression("curr_E", "{curr_Ex, curr_Ey, curr_Ez}");
+    //sdb.write_vector_expression("curr_H", "{curr_Hx, curr_Hy, curr_Hz}");
+    sdb.write_zonal_variable("next_Ex", next_Ex);
+    sdb.write_zonal_variable("next_Ey", next_Ey);
+    sdb.write_zonal_variable("next_Ez", next_Ez);
+    sdb.write_zonal_variable("next_Hx", next_Hx);
+    sdb.write_zonal_variable("next_Hy", next_Hy);
+    sdb.write_zonal_variable("next_Hz", next_Hz);
+    //sdb.write_vector_expression("next_E", "{next_Ex, next_Ey, next_Ez}");
+    //sdb.write_vector_expression("next_H", "{next_Hx, next_Hy, next_Hz}");
+}
+
+void
+export_to_silo(const model& mod, const electromagnetic_field& emf,
+    const std::string& filename)
+{
+    silo sdb;
+    sdb.create_db(filename);
+    sdb.import_mesh_from_gmsh();
+    sdb.write_mesh();
+
+    std::vector<double> Ex; Ex.resize( mod.num_cells() );
+    std::vector<double> Ey; Ey.resize( mod.num_cells() );
+    std::vector<double> Ez; Ez.resize( mod.num_cells() );
+    std::vector<double> Hx; Hx.resize( mod.num_cells() );
+    std::vector<double> Hy; Hy.resize( mod.num_cells() );
+    std::vector<double> Hz; Hz.resize( mod.num_cells() );
+
+    for (size_t iE = 0; iE < mod.num_entities(); iE++)
+    {
+        auto& e = mod.entity_at(iE);
+        for (size_t iT = 0; iT < e.num_cells(); iT++)
+        {
+            auto& pe = e.cell(iT);
+            auto& re = e.cell_refelem(pe);
+            vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
+            auto num_bf = re.num_basis_functions();
+            auto ofs = e.cell_global_dof_offset(iT);
+            auto gi = e.cell_global_index_by_gmsh(iT);
+            Ex[gi] = emf.Ex.segment(ofs, num_bf).dot(phi_bar);
+            Ey[gi] = emf.Ey.segment(ofs, num_bf).dot(phi_bar);
+            Ez[gi] = emf.Ez.segment(ofs, num_bf).dot(phi_bar);
+            Hx[gi] = emf.Hx.segment(ofs, num_bf).dot(phi_bar);
+            Hy[gi] = emf.Hy.segment(ofs, num_bf).dot(phi_bar);
+            Hz[gi] = emf.Hz.segment(ofs, num_bf).dot(phi_bar);
+        }
+    }
+
+    sdb.write_zonal_variable("Ex", Ex);
+    sdb.write_zonal_variable("Ey", Ey);
+    sdb.write_zonal_variable("Ez", Ez);
+    sdb.write_zonal_variable("Hx", Hx);
+    sdb.write_zonal_variable("Hy", Hy);
+    sdb.write_zonal_variable("Hz", Hz);
+}
+
+int main(int argc, const char *argv[])
+{
+    gmsh::initialize();
+
+    make_geometry(1, 0.05);
+
+    model mod(1,2);
+    solver_state_gpu state;
+    init_from_model(mod, state);
+
+    apply_operator(state, state.emf_curr, state.emf_next);
+
+    electromagnetic_field res;
+    state.emf_next.copyout(res);
+
+    export_to_silo(mod, res, "test.silo");
+
+    //gmsh::finalize();
+
+    return 0;
+}
\ No newline at end of file
diff --git a/src/silo_io.cpp b/src/silo_io.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4ab382767cf083be49f65bcd6626ce4c7629066
--- /dev/null
+++ b/src/silo_io.cpp
@@ -0,0 +1,212 @@
+#include <iostream>
+#include <cassert>
+
+#include "gmsh.h"
+#include "silo_output.hpp"
+
+#define INVALID_OFS ((size_t) (~0))
+
+silo::silo()
+{}
+
+silo::~silo()
+{
+    close_db();
+}
+
+void
+silo::gmsh_get_nodes(void)
+{
+    std::vector<size_t>     nodeTags;
+    std::vector<double>     coords;
+    std::vector<double>     paraCoords;
+
+    gmsh::model::mesh::getNodes(nodeTags, coords, paraCoords, -1, -1, true, false);
+
+    auto maxtag_pos = std::max_element(nodeTags.begin(), nodeTags.end());
+    
+    size_t maxtag = 0;
+    if (maxtag_pos != nodeTags.end())
+        maxtag = (*maxtag_pos)+1;
+
+    node_coords_x.resize( nodeTags.size() );
+    node_coords_y.resize( nodeTags.size() );
+    node_coords_z.resize( nodeTags.size() );
+
+    for (size_t i = 0; i < nodeTags.size(); i++)
+    {
+        node_coords_x[i] = coords[3*i+0];
+        node_coords_y[i] = coords[3*i+1];
+        node_coords_z[i] = coords[3*i+2];
+    }
+
+    node_tag2ofs.resize(maxtag, INVALID_OFS);
+    for (size_t i = 0; i < nodeTags.size(); i++)
+        node_tag2ofs.at( nodeTags[i] ) = i;
+}
+
+void
+silo::gmsh_get_elements()
+{
+    gmsh::vectorpair entities;
+    num_cells = 0;
+    gmsh::model::getEntities(entities, 3);
+    for (auto [dim, tag] : entities)
+    {
+        std::vector<int> elemTypes;
+        gmsh::model::mesh::getElementTypes(elemTypes, dim, tag);
+        for (auto& elemType : elemTypes)
+        {
+            std::vector<size_t> elemTags;
+            std::vector<size_t> elemNodeTags;
+            gmsh::model::mesh::getElementsByType(elemType, elemTags, elemNodeTags, tag);
+            auto nodesPerElem = elemNodeTags.size()/elemTags.size();
+            assert( elemTags.size() * nodesPerElem == elemNodeTags.size() );
+        
+            for (size_t i = 0; i < elemTags.size(); i++)
+            {
+                auto base = nodesPerElem * i;
+
+                auto node0_tag = elemNodeTags[base + 0];
+                assert(node0_tag < node_tag2ofs.size());
+                auto node0_ofs = node_tag2ofs[node0_tag];
+                assert(node0_ofs != INVALID_OFS);
+                nodelist.push_back(node0_ofs + 1);
+
+                auto node1_tag = elemNodeTags[base + 1];
+                assert(node1_tag < node_tag2ofs.size());
+                auto node1_ofs = node_tag2ofs[node1_tag];
+                assert(node1_ofs != INVALID_OFS);
+                nodelist.push_back(node1_ofs + 1);
+
+                auto node2_tag = elemNodeTags[base + 2];
+                assert(node2_tag < node_tag2ofs.size());
+                auto node2_ofs = node_tag2ofs[node2_tag];
+                assert(node2_ofs != INVALID_OFS);
+                nodelist.push_back(node2_ofs + 1);
+
+                auto node3_tag = elemNodeTags[base + 3];
+                assert(node3_tag < node_tag2ofs.size());
+                auto node3_ofs = node_tag2ofs[node3_tag];
+                assert(node3_ofs != INVALID_OFS);
+                nodelist.push_back(node3_ofs + 1);
+            }
+
+            num_cells += elemTags.size();
+        }
+    }
+
+    assert(nodelist.size() == 4*num_cells);
+}
+
+void
+silo::import_mesh_from_gmsh()
+{
+    gmsh_get_nodes();
+    gmsh_get_elements();        
+}
+
+bool
+silo::create_db(const std::string& dbname)
+{
+    m_siloDb = DBCreate(dbname.c_str(), DB_CLOBBER, DB_LOCAL, NULL, DB_HDF5);
+    if (m_siloDb)
+        return true;
+
+    std::cout << "SILO: Error creating database" << std::endl;
+    return false;
+}
+
+bool
+silo::write_mesh(double time, int cycle)
+{
+    DBoptlist   *optlist = nullptr;
+
+    if (time >= 0)
+    {
+        optlist = DBMakeOptlist(2);
+        DBAddOption(optlist, DBOPT_CYCLE, &cycle);
+        DBAddOption(optlist, DBOPT_DTIME, &time);
+    }
+
+    double *coords[] = {node_coords_x.data(), node_coords_y.data(), node_coords_z.data()};
+
+    int lnodelist = nodelist.size();
+
+    int shapetype[] = { DB_ZONETYPE_TET };
+    int shapesize[] = {4};
+    int shapecounts[] = { num_cells };
+    int nshapetypes = 1;
+    int nnodes = node_coords_x.size();
+    int nzones = num_cells;
+    int ndims = 3;
+
+    DBPutZonelist2(m_siloDb, "zonelist", nzones, ndims,
+        nodelist.data(), lnodelist, 1, 0, 0, shapetype, shapesize,
+        shapecounts, nshapetypes, NULL);
+
+    DBPutUcdmesh(m_siloDb, "mesh", ndims, NULL, coords, nnodes, nzones,
+        "zonelist", NULL, DB_DOUBLE, optlist);
+
+    if (optlist)
+        DBFreeOptlist(optlist);
+
+    return true;
+}
+
+bool
+silo::write_zonal_variable(const std::string& name, const std::vector<double>& data)
+{
+    if (data.size() != num_cells)
+    {
+        std::cout << "Invalid number of cells" << std::endl;
+        return false;
+    }
+
+    DBPutUcdvar1(m_siloDb, name.c_str(), "mesh", data.data(), data.size(), NULL,
+                    0, DB_DOUBLE, DB_ZONECENT, NULL);
+    
+    return true;
+}
+
+bool
+silo::write_zonal_variable(const std::string& name, const std::vector<float>& data)
+{
+    if (data.size() != num_cells)
+    {
+        std::cout << "Invalid number of cells" << std::endl;
+        return false;
+    }
+
+    DBPutUcdvar1(m_siloDb, name.c_str(), "mesh", data.data(), data.size(), NULL,
+                    0, DB_FLOAT, DB_ZONECENT, NULL);
+    
+    return true;
+}
+
+bool
+silo::write_scalar_expression(const std::string& name, const std::string& expression)
+{
+    const char *names[] = { name.c_str() };
+    const char *defs[] = { expression.c_str() };
+    int type = DB_VARTYPE_SCALAR;
+    DBPutDefvars(m_siloDb, "defvars", 1, names, &type, defs, nullptr);
+}
+
+bool
+silo::write_vector_expression(const std::string& name, const std::string& expression)
+{
+    const char *names[] = { name.c_str() };
+    const char *defs[] = { expression.c_str() };
+    int type = DB_VARTYPE_VECTOR;
+    DBPutDefvars(m_siloDb, "defvars", 1, names, &type, defs, nullptr);
+}
+
+void
+silo::close_db()
+{
+    if (m_siloDb)
+        DBClose(m_siloDb);
+
+    m_siloDb = nullptr;
+}
\ No newline at end of file
diff --git a/tests/profile_differentiation_gpu.cpp b/tests/profile_differentiation_gpu.cpp
index 4ace3de2ac94e8afd3c2ea505ae795d9bd71cd5a..522e8824486592ad1eea6cadae3f293d2e260853 100644
--- a/tests/profile_differentiation_gpu.cpp
+++ b/tests/profile_differentiation_gpu.cpp
@@ -68,7 +68,7 @@ int profile_differentiation(int geometric_order, int approximation_order)
         timecounter_gpu tc;
         tc.tic();
         gpu_compute_field_derivatives(edg, Pf_gpu.get_texture(), df_dx_gpu.data(),
-            df_dy_gpu.data(), df_dz_gpu.data());
+            df_dy_gpu.data(), df_dz_gpu.data(), 1.0);
         double time = tc.toc();
 
         vecxd df_dx_cpu = vecxd::Zero( Pf_cpu.size() );
@@ -82,7 +82,7 @@ int profile_differentiation(int geometric_order, int approximation_order)
         if (geometric_order == 1)
         {
             std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
-            double flops = 21*ed.num_bf*ed.num_bf*e0.num_cells();
+            double flops = (21*ed.num_bf + 3)*ed.num_bf*e0.num_cells();
             std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
         }
         else
diff --git a/tests/test_differentiation_gpu.cpp b/tests/test_differentiation_gpu.cpp
index 4431c0a1297518700015d0e60ba3dd22f25bad81..b848773d8bfa704c6d9b93508df56c63e3ce6e87 100644
--- a/tests/test_differentiation_gpu.cpp
+++ b/tests/test_differentiation_gpu.cpp
@@ -123,7 +123,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
             reshape_dofs(eds[i], edgs[i], Pf, Pf_reshaped, true);
         texture_allocator<double> Pf_gpu(Pf_reshaped.data(), Pf_reshaped.size());
 #else
-        texture_allocator<double> Pf_gpu(Pf.data(), Pf.size());
+        device_vector<double> Pf_gpu(Pf.data(), Pf.size());
 #endif
         device_vector<double> df_dx_gpu(model_num_dofs_gpu);
         device_vector<double> df_dy_gpu(model_num_dofs_gpu);
@@ -133,15 +133,15 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
         {
             timecounter_gpu tc;
             tc.tic();
-            gpu_compute_field_derivatives(edg, Pf_gpu.get_texture(),
-                df_dx_gpu.data(), df_dy_gpu.data(), df_dz_gpu.data());
+            gpu_compute_field_derivatives(edg, Pf_gpu.data(),
+                df_dx_gpu.data(), df_dy_gpu.data(), df_dz_gpu.data(), 1.0);
             double time = tc.toc();
 
             auto num_cells = edg.num_all_elems;
             if (geometric_order == 1)
             {
                 std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
-                double flops = 21*edg.num_bf*edg.num_bf*num_cells;
+                double flops = (21*edg.num_bf + 3)*edg.num_bf*num_cells;
                 std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
             }
             else
@@ -270,7 +270,7 @@ int main(void)
 
     std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
     for (size_t go = 1; go < 2; go++)
-        for (size_t ao = go; ao < 5; ao++)
+        for (size_t ao = 1; ao < 7; ao++)
             failed_tests += test_differentiation_convergence(go, ao);
 
     return failed_tests;