diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h
index 8277b482fa9a55da8a26192f457e54697b5465d8..d5f7626fe4a4dfef5ab8288ed7f2e00204c354b5 100644
--- a/include/maxwell_interface.h
+++ b/include/maxwell_interface.h
@@ -314,7 +314,7 @@ void init_from_model(const model&, solver_state&);
 void init_matparams(const model&, solver_state&, const parameter_loader&);
 //void apply_operator(solver_state&, const field&, field&);
 void export_fields_to_silo(const model&, const solver_state&, const parameter_loader&);
-void timestep(solver_state&, time_integrator_type);
+void timestep(solver_state&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state&);
@@ -324,7 +324,7 @@ void init_from_model(const model&, solver_state_gpu&);
 void init_matparams(const model&, solver_state_gpu&, const parameter_loader&);
 //void apply_operator(solver_state_gpu&, const field_gpu&, field_gpu&);
 void export_fields_to_silo(const model&, const solver_state_gpu&, const parameter_loader&);
-void timestep(solver_state_gpu&, time_integrator_type);
+void timestep(solver_state_gpu&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state_gpu&);
@@ -398,13 +398,13 @@ void gpu_compute_jumps_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
     double *, gpuStream_t stream = 0);
 
 void gpu_compute_fluxes(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 
 void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 
 void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 
 void decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs,
     field_gpu& srcs);
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 2c88604fc3818c5dc1cd3c66e3055529034aec4f..050d27fa568239409f1c1de837aa73bccfd34993 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -227,7 +227,7 @@ gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
     for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
     {
         int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
-        int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes);
+        int32_t f_ofs = elem_flux_base + dof;
         acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs);
     }
 
@@ -304,7 +304,7 @@ void
 gpu_curl(double *dst, const double *add, const double *sub, size_t num_dofs,
     cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
@@ -326,7 +326,7 @@ void
 gpu_curl(double *dst, const double *add, const double *sub, const double *source,
     size_t num_dofs, cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
@@ -468,8 +468,9 @@ gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict
     if (dof < num_dofs)
     {
         double dt_sigma_over_2eps = 0.5*dt*soe[dof];
-        double CR = (1.0 - dt_sigma_over_2eps)/(1.0 + dt_sigma_over_2eps);
-        double CC = dt*ie[dof]/(1.0 + dt_sigma_over_2eps);
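+        // Factor out the common reciprocal: both coefficients divide by
+        // (1 + dt*sigma/2eps), so this trades a second division for two multiplies.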
+        double k = 1.0/(1.0 + dt_sigma_over_2eps);
+        double CR = (1.0 - dt_sigma_over_2eps)*k;
+        double CC = dt*ie[dof]*k;
 
         next[dof] = curr[dof]*CR + tmp[dof]*CC;
     }
@@ -480,7 +481,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
     const double *tmp, const double *ie, size_t num_dofs, double dt,
     cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
 
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
@@ -508,7 +509,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr,
     const double *tmp, const double *im, size_t num_dofs, double dt,
     cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
 
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
diff --git a/src/maxwell_cpu.cpp b/src/maxwell_cpu.cpp
index 7834684b72a9a442e0caefdf061a1965a3f427c8..4adfdecb5bfe840d586f9a69d6f5cd6927f13edb 100644
--- a/src/maxwell_cpu.cpp
+++ b/src/maxwell_cpu.cpp
@@ -505,7 +505,7 @@ leapfrog(solver_state& state)
 }
 
 void
-timestep(solver_state& state, time_integrator_type ti)
+timestep(solver_state& state, const parameter_loader&, time_integrator_type ti)
 {
     if (ti == time_integrator_type::EULER)
     {
diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp
index b1d5bad9711fe4929d2ed6c7c98442161ee4b986..df2409056898526477370ed1c13c0373da9fa769 100644
--- a/src/maxwell_gpu.cpp
+++ b/src/maxwell_gpu.cpp
@@ -150,7 +150,7 @@ compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field&
 }
 
 static void
-compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -172,9 +172,19 @@ compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
     auto Jy = state.Jy_src_gpu.data();
     auto Jz = state.Jz_src_gpu.data();
     
-    gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    if (use_sources)
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
+
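+    // The H-field curls carry no volume current term, so they stay unconditional.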
     gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
@@ -203,7 +213,7 @@ compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
 }
 
 static void
-compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -218,17 +228,26 @@ compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
         gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
     }
 
-    auto Jx = state.Jx_src_gpu.data();
-    auto Jy = state.Jy_src_gpu.data();
-    auto Jz = state.Jz_src_gpu.data();
+    if (use_sources)
+    {
+        auto Jx = state.Jx_src_gpu.data();
+        auto Jy = state.Jy_src_gpu.data();
+        auto Jz = state.Jz_src_gpu.data();
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
 
-    gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
 }
 
 static void
-compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -238,7 +257,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -255,7 +274,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -265,7 +284,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -279,7 +298,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -289,7 +308,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -303,7 +322,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-leapfrog(solver_state_gpu& state)
+leapfrog(solver_state_gpu& state, const parameter_loader& mpl)
 {
     double *r = state.matparams.sigma_over_epsilon.data();
     double *eps = state.matparams.inv_epsilon.data();
@@ -313,10 +332,14 @@ leapfrog(solver_state_gpu& state)
     auto nextp = state.emf_next.data();
     auto tmpp = state.tmp.data();
 
-    auto dt = state.delta_t;
-    compute_curls_H(state, state.emf_curr, state.tmp);
-    compute_fluxes_H(state, state.emf_curr, state.tmp);
+    bool ve = mpl.volume_sources_enabled();
+    compute_curls_H(state, state.emf_curr, state.tmp, ve);
+
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
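+    // Boundary and interface sources both enter through the flux terms, so
+    // either one being active forces the source path in the flux kernels.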
+    compute_fluxes_H(state, state.emf_curr, state.tmp, be or ie);
     
+    auto dt = state.delta_t;
     gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, r, tmpp.Ex, eps, nextp.num_dofs,
         dt, state.compute_stream);
     gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, r, tmpp.Ey, eps, nextp.num_dofs,
@@ -325,7 +348,7 @@ leapfrog(solver_state_gpu& state)
         dt, state.compute_stream);
 
     compute_curls_E(state, state.emf_next, state.tmp);
-    compute_fluxes_E(state, state.emf_next, state.tmp);
+    compute_fluxes_E(state, state.emf_next, state.tmp, be or ie);
 
     gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, mu, nextp.num_dofs,
         dt, state.compute_stream);
@@ -336,10 +359,14 @@ leapfrog(solver_state_gpu& state)
 }
 
 static void
-apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+apply_operator(solver_state_gpu& state, const parameter_loader& mpl, const field_gpu& curr,
+    field_gpu& next)
 {
-    compute_curls(state, curr, next);
-    compute_fluxes(state, curr, next);
+    bool ve = mpl.volume_sources_enabled();
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
+    compute_curls(state, curr, next, ve);
+    compute_fluxes(state, curr, next, be or ie);
 }
 
 static void
@@ -386,36 +413,36 @@ compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
 }
 
 void
-timestep(solver_state_gpu& state, time_integrator_type ti)
+timestep(solver_state_gpu& state, const parameter_loader& mpl, time_integrator_type ti)
 {
     //timecounter tc;
     //tc.tic();
 
     if (ti == time_integrator_type::EULER)
     {
-        apply_operator(state, state.emf_curr, state.tmp);
+        apply_operator(state, mpl, state.emf_curr, state.tmp);
         compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::RK4)
     {
-        apply_operator(state, state.emf_curr, state.k1);
+        apply_operator(state, mpl, state.emf_curr, state.k1);
 
         compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k2);
+        apply_operator(state, mpl, state.tmp, state.k2);
 
         compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k3);
+        apply_operator(state, mpl, state.tmp, state.k3);
 
         compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k4);
+        apply_operator(state, mpl, state.tmp, state.k4);
 
         compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::LEAPFROG)
     {
-        leapfrog(state);
+        leapfrog(state, mpl);
     }
 
     state.curr_time += state.delta_t;
@@ -483,6 +510,9 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
         state.Jx_src_gpu.zero();
         state.Jy_src_gpu.zero();
         state.Jz_src_gpu.zero();
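+        // The J sources are double-buffered (rotated in swap()), so presumably
+        // both copies must be cleared to fully silence a removed source.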
+        state.Jx_src_gpu_buf.zero();
+        state.Jy_src_gpu_buf.zero();
+        state.Jz_src_gpu_buf.zero();
         state.bndsrcs_decomp_cpu.zero();
         state.bndsrcs_cpu.zero();
         mpl.source_was_cleared();
@@ -510,13 +540,18 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
 }
 
 void
-swap(maxwell::solver_state_gpu& state)
+swap(solver_state_gpu& state)
 {
     checkGPU( gpuDeviceSynchronize() );
+
+    //auto be = mpl.boundary_sources_enabled();
+    //auto ie = mpl.interface_sources_enabled();
+    //if ( be or ie )
+        decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
+
     std::swap(state.Jx_src_gpu, state.Jx_src_gpu_buf);
     std::swap(state.Jy_src_gpu, state.Jy_src_gpu_buf);
     std::swap(state.Jz_src_gpu, state.Jz_src_gpu_buf);
-    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
     std::swap(state.emf_curr, state.emf_next);
 }
 
diff --git a/src/maxwell_kernels_cuda.cu b/src/maxwell_kernels_cuda.cu
index 7365e927a56e267686e42e6b4c59ffbd8c2ad7c7..3e7d848f49dd24dd1f4f2fec98b0aac68a7d3022 100644
--- a/src/maxwell_kernels_cuda.cu
+++ b/src/maxwell_kernels_cuda.cu
@@ -42,7 +42,7 @@ void
 gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;    
 
@@ -90,7 +90,7 @@ void
 gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;    
 
@@ -122,7 +122,7 @@ void
 gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;    
 
@@ -150,12 +150,12 @@ gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu&
     checkGPU(cudaPeekAtLastError());
 }
 
-template<size_t K>
+template<size_t K, bool do_E, bool do_H>
 __global__ void
 gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
     field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
     const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
+    size_t flux_base, size_t num_fluxes, bool use_sources)
 {
     using KS = kernel_gpu_sizes<K>;
     auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
@@ -171,112 +171,51 @@ gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_
     auto ny = face_normals[3*entry_num + 1];
     auto nz = face_normals[3*entry_num + 2];
 
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+    auto jEx = jump.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos];
 
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-    
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
- 
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
-
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
- 
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
+    if (use_sources)
+    {
+        jEx -= bndsrc.Ex[global_dof_pos];
+        jEy -= bndsrc.Ey[global_dof_pos];
+        jEz -= bndsrc.Ez[global_dof_pos];
+        jHx += bndsrc.Hx[global_dof_pos];
+        jHy += bndsrc.Hy[global_dof_pos];
+        jHz += bndsrc.Hz[global_dof_pos];
+    }
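+    // do_E/do_H are compile-time switches: the former E-only and H-only kernel
+    // variants are now instantiations of this single template, while
+    // use_sources remains a runtime flag shared by all instantiations.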
 
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+    if constexpr(do_E)
+    {
+        auto ndotE = nx*jEx + ny*jEy + nz*jEz;
+        auto aE = face_det * fcp.aE[global_dof_pos];
+        auto bE = face_det * fcp.bE[global_dof_pos];
+        flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
+        flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
+        flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
+    }
 
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-    
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
- 
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+    if constexpr(do_H)
+    {
+        auto ndotH = nx*jHx + ny*jHy + nz*jHz;
+        auto aH = face_det * fcp.aH[global_dof_pos];
+        auto bH = face_det * fcp.bH[global_dof_pos];
+        flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
+        flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
+        flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+    }
 }
 
 template<size_t K, typename Kernel>
 void
 launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
-    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp, bool use_sources,
     cudaStream_t stream)
 {
-    static const size_t FLUX_THREADS = 256;
+    static const size_t FLUX_THREADS = 1024;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
 
@@ -292,38 +231,50 @@ launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const f
 
     kernel<<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
         edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
-        num_all_fluxes);
+        num_all_fluxes, use_sources);
 }
 
 void
 gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
         
-        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
         
-        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
-        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
-        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
-        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
         default: throw std::invalid_argument("unsupported order");
@@ -333,33 +284,45 @@ gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-        
-        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-        
-        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
+        case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
 
         default: throw std::invalid_argument("unsupported order");
     }
@@ -368,33 +331,45 @@ gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-        
-        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-        
-        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
+        case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
 
         default: throw std::invalid_argument("unsupported order");
     }
diff --git a/src/maxwell_solver.cpp b/src/maxwell_solver.cpp
index 69ee2b95ce30a2e91c9bebd66be8b562e1874b52..78c09f6c7d4164897e9e0513c29d42c8f1e35070 100644
--- a/src/maxwell_solver.cpp
+++ b/src/maxwell_solver.cpp
@@ -14,6 +14,7 @@
 #include "maxwell_interface.h"
 #include "maxwell_common.h"
 #include "param_loader.h"
+#include "timecounter.h"
 
 #if 0
 static void
@@ -86,23 +87,28 @@ void test_it(const model& mod, State& state, maxwell::parameter_loader& mpl)
     std::cout << "s each." << std::endl;
     std::cout << "Time integrator: " << mpl.sim_timeIntegratorName() << std::endl;
 
+    timecounter tc;
+    tc.tic();
     prepare_sources(mod, state, mpl);
     for(size_t i = 0; i < num_timesteps; i++)
     {
         mpl.call_timestep_callback(i);
-        timestep(state, ti);
+        timestep(state, mpl, ti);
         do_sources(mod, state, mpl);
         
 
         if (i%silo_output_rate == 0)
             export_fields_to_silo(mod, state, mpl);
 
+        swap(state);
+
         if (i%cycle_print_rate == 0)
         {
+            double time = tc.toc();
             std::cout << "Cycle " << i << ": t = " << state.curr_time << " s";
-            std::cout << std::endl;
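+            // Rough throughput: 6 field components per DOF, measured over the
+            // last cycle_print_rate timesteps.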
+            std::cout << ", DOFs/s: " << (mod.num_dofs()*6*cycle_print_rate)/(time) << std::endl;
+            tc.tic();
         }
-        swap(state);
     }
 }