Commit 15378f6e authored by Matteo Cicuttin's avatar Matteo Cicuttin

Performance optimizations.

parent 6ed2b36e
@@ -314,7 +314,7 @@ void init_from_model(const model&, solver_state&);
 void init_matparams(const model&, solver_state&, const parameter_loader&);
 //void apply_operator(solver_state&, const field&, field&);
 void export_fields_to_silo(const model&, const solver_state&, const parameter_loader&);
-void timestep(solver_state&, time_integrator_type);
+void timestep(solver_state&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state&);
@@ -324,7 +324,7 @@ void init_from_model(const model&, solver_state_gpu&);
 void init_matparams(const model&, solver_state_gpu&, const parameter_loader&);
 //void apply_operator(solver_state_gpu&, const field_gpu&, field_gpu&);
 void export_fields_to_silo(const model&, const solver_state_gpu&, const parameter_loader&);
-void timestep(solver_state_gpu&, time_integrator_type);
+void timestep(solver_state_gpu&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state_gpu&);
@@ -398,13 +398,13 @@ void gpu_compute_jumps_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
     double *, gpuStream_t stream = 0);
 void gpu_compute_fluxes(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 void decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs,
     field_gpu& srcs);
@@ -227,7 +227,7 @@ gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
     for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
     {
         int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
-        int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes);
+        int32_t f_ofs = elem_flux_base + dof;
         acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs);
     }
@@ -304,7 +304,7 @@ void
 gpu_curl(double *dst, const double *add, const double *sub, size_t num_dofs,
     cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
@@ -326,7 +326,7 @@ void
 gpu_curl(double *dst, const double *add, const double *sub, const double *source,
     size_t num_dofs, cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
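Note on the launch-geometry changes above: the block size drops from 256 to 128 threads, presumably an occupancy/latency tuning choice for these memory-bound kernels (the commit message only says "performance optimizations", so the rationale is inferred). The accompanying grid-size computation is the usual ceiling division; a minimal standalone sketch, with illustrative names:

    #include <cstddef>

    // Smallest grid size gs such that gs * block_size >= num_dofs;
    // equivalent to the "gs = n/b; if (n % b) gs += 1;" idiom in the diff.
    static inline size_t grid_size(size_t num_dofs, size_t block_size)
    {
        return (num_dofs + block_size - 1) / block_size;
    }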
@@ -468,8 +468,9 @@ gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict
     if (dof < num_dofs)
     {
         double dt_sigma_over_2eps = 0.5*dt*soe[dof];
-        double CR = (1.0 - dt_sigma_over_2eps)/(1.0 + dt_sigma_over_2eps);
-        double CC = dt*ie[dof]/(1.0 + dt_sigma_over_2eps);
+        double k = 1./(1.0 + dt_sigma_over_2eps);
+        double CR = (1.0 - dt_sigma_over_2eps)*k;
+        double CC = dt*ie[dof]*k;
         next[dof] = curr[dof]*CR + tmp[dof]*CC;
     }
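The kernel rewrite above is a pure strength reduction: both update coefficients share the denominator (1 + dt_sigma_over_2eps), so factoring out its reciprocal replaces two double-precision divisions per DOF with one division and two multiplies. With a = 0.5*dt*(sigma/eps) (from soe[]) and 1/eps from ie[]:

    k  = 1/(1 + a)
    CR = (1 - a)/(1 + a) = (1 - a)*k
    CC = dt*(1/eps)/(1 + a) = dt*(1/eps)*k

The two forms are algebraically equivalent, though not guaranteed bit-identical in floating point.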
@@ -480,7 +481,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
     const double *tmp, const double *ie, size_t num_dofs, double dt,
     cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
@@ -508,7 +509,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr,
     const double *tmp, const double *im, size_t num_dofs, double dt,
     cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
@@ -505,7 +505,7 @@ leapfrog(solver_state& state)
 }
 
 void
-timestep(solver_state& state, time_integrator_type ti)
+timestep(solver_state& state, const parameter_loader&, time_integrator_type ti)
 {
     if (ti == time_integrator_type::EULER)
     {
@@ -150,7 +150,7 @@ compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field&
 }
 
 static void
-compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -172,9 +172,19 @@ compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
     auto Jy = state.Jy_src_gpu.data();
     auto Jz = state.Jz_src_gpu.data();
 
+    if (use_sources)
+    {
     gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
 
     gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
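The use_sources flag added above lets the E-side curls skip the current-density term, and its three extra global-memory reads per DOF, whenever no volume source is active. Judging from the argument order, gpu_curl evaluates one Cartesian component of the curl with an optional source subtraction; for the x-component of the E update, for example:

    dEx/dt ~ (curl H)_x - Jx = dHz/dy - dHy/dz - Jx

which matches gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, ...), i.e. dst = add - sub - source, while the source-free overload computes dst = add - sub.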
@@ -203,7 +213,7 @@ compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
 }
 
 static void
-compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -218,17 +228,26 @@ compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
         gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
     }
 
+    if (use_sources)
+    {
     auto Jx = state.Jx_src_gpu.data();
     auto Jy = state.Jy_src_gpu.data();
     auto Jz = state.Jz_src_gpu.data();
 
     gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
 }
 
 static void
-compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -238,7 +257,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -255,7 +274,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -265,7 +284,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -279,7 +298,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -289,7 +308,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -303,7 +322,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-leapfrog(solver_state_gpu& state)
+leapfrog(solver_state_gpu& state, const parameter_loader& mpl)
 {
     double *r = state.matparams.sigma_over_epsilon.data();
     double *eps = state.matparams.inv_epsilon.data();
@@ -313,10 +332,14 @@ leapfrog(solver_state_gpu& state)
     auto nextp = state.emf_next.data();
     auto tmpp = state.tmp.data();
 
-    auto dt = state.delta_t;
-    compute_curls_H(state, state.emf_curr, state.tmp);
-    compute_fluxes_H(state, state.emf_curr, state.tmp);
+    bool ve = mpl.volume_sources_enabled();
+    compute_curls_H(state, state.emf_curr, state.tmp, ve);
+
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
+    compute_fluxes_H(state, state.emf_curr, state.tmp, be or ie);
 
+    auto dt = state.delta_t;
     gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, r, tmpp.Ex, eps, nextp.num_dofs,
         dt, state.compute_stream);
     gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, r, tmpp.Ey, eps, nextp.num_dofs,
@@ -325,7 +348,7 @@ leapfrog(solver_state_gpu& state)
         dt, state.compute_stream);
 
     compute_curls_E(state, state.emf_next, state.tmp);
-    compute_fluxes_E(state, state.emf_next, state.tmp);
+    compute_fluxes_E(state, state.emf_next, state.tmp, be or ie);
 
     gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, mu, nextp.num_dofs,
         dt, state.compute_stream);
@@ -336,10 +359,14 @@ leapfrog(solver_state_gpu& state)
 }
 
 static void
-apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+apply_operator(solver_state_gpu& state, const parameter_loader& mpl, const field_gpu& curr,
+    field_gpu& next)
 {
-    compute_curls(state, curr, next);
-    compute_fluxes(state, curr, next);
+    bool ve = mpl.volume_sources_enabled();
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
+    compute_curls(state, curr, next, ve);
+    compute_fluxes(state, curr, next, be or ie);
 }
 
 static void
@@ -386,36 +413,36 @@ compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
 }
 
 void
-timestep(solver_state_gpu& state, time_integrator_type ti)
+timestep(solver_state_gpu& state, const parameter_loader& mpl, time_integrator_type ti)
 {
     //timecounter tc;
     //tc.tic();
 
     if (ti == time_integrator_type::EULER)
     {
-        apply_operator(state, state.emf_curr, state.tmp);
+        apply_operator(state, mpl, state.emf_curr, state.tmp);
         compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::RK4)
     {
-        apply_operator(state, state.emf_curr, state.k1);
+        apply_operator(state, mpl, state.emf_curr, state.k1);
         compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k2);
+        apply_operator(state, mpl, state.tmp, state.k2);
         compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k3);
+        apply_operator(state, mpl, state.tmp, state.k3);
         compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k4);
+        apply_operator(state, mpl, state.tmp, state.k4);
         compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::LEAPFROG)
     {
-        leapfrog(state);
+        leapfrog(state, mpl);
     }
 
     state.curr_time += state.delta_t;
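The RK4 branch is the classical fourth-order Runge-Kutta scheme expressed through repeated Euler-style updates; assuming apply_operator evaluates the semi-discrete operator L and compute_rk4_weighted_sum forms the standard weighted combination, the stages read:

    k1 = L(y_n)
    k2 = L(y_n + 0.5*dt*k1)
    k3 = L(y_n + 0.5*dt*k2)
    k4 = L(y_n + dt*k3)
    y_{n+1} = y_n + (dt/6)*(k1 + 2*k2 + 2*k3 + k4)

Threading mpl through every apply_operator call is what lets each stage decide whether source terms need to be evaluated at all.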
@@ -483,6 +510,9 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
     state.Jx_src_gpu.zero();
     state.Jy_src_gpu.zero();
     state.Jz_src_gpu.zero();
+    state.Jx_src_gpu_buf.zero();
+    state.Jy_src_gpu_buf.zero();
+    state.Jz_src_gpu_buf.zero();
     state.bndsrcs_decomp_cpu.zero();
     state.bndsrcs_cpu.zero();
     mpl.source_was_cleared();
@@ -510,13 +540,18 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
 }
 
 void
-swap(maxwell::solver_state_gpu& state)
+swap(solver_state_gpu& state)
 {
     checkGPU( gpuDeviceSynchronize() );
 
+    //auto be = mpl.boundary_sources_enabled();
+    //auto ie = mpl.interface_sources_enabled()
+    //if ( be or ie )
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
+
     std::swap(state.Jx_src_gpu, state.Jx_src_gpu_buf);
     std::swap(state.Jy_src_gpu, state.Jy_src_gpu_buf);
     std::swap(state.Jz_src_gpu, state.Jz_src_gpu_buf);
-    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
 
     std::swap(state.emf_curr, state.emf_next);
 }
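swap() rotates the double-buffered source and field storage after synchronizing the device, so uploads for the next step can proceed while the current step computes. A minimal sketch of that producer/consumer double-buffering pattern (simplified types and names, not the repository's API):

    #include <cuda_runtime.h>
    #include <utility>

    struct src_buffers {
        double *front;   // read by kernels during the current timestep
        double *back;    // filled asynchronously for the next timestep
        size_t  n;
    };

    // Issued on a dedicated copy stream, overlapping the compute stream.
    void upload_next(src_buffers& b, const double *host, cudaStream_t copy_stream)
    {
        cudaMemcpyAsync(b.back, host, b.n * sizeof(double),
                        cudaMemcpyHostToDevice, copy_stream);
    }

    void end_of_step(src_buffers& b)
    {
        cudaDeviceSynchronize();     // both streams have finished
        std::swap(b.front, b.back);  // next step reads what was just uploaded
    }

Note that decompress_bndsrc now runs before the std::swap calls, and the commented-out be/ie checks suggest gating the decompression on active boundary/interface sources is planned but not yet enabled.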
@@ -42,7 +42,7 @@ void
 gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -90,7 +90,7 @@ void
 gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -122,7 +122,7 @@ void
 gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -150,12 +150,12 @@ gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu&
     checkGPU(cudaPeekAtLastError());
 }
 
-template<size_t K>
+template<size_t K, bool do_E, bool do_H>
 __global__ void
 gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
     field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
     const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
+    size_t flux_base, size_t num_fluxes, bool use_sources)
 {
     using KS = kernel_gpu_sizes<K>;
     auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
@@ -171,112 +171,51 @@ gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_
     auto ny = face_normals[3*entry_num + 1];
     auto nz = face_normals[3*entry_num + 2];
 
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
-
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
-
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
-
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
-
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
-
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
-}
+    auto jEx = jump.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos];
+
+    if (use_sources)
+    {
+        jEx += - bndsrc.Ex[global_dof_pos];
+        jEy += - bndsrc.Ey[global_dof_pos];
+        jEz += - bndsrc.Ez[global_dof_pos];
+        jHx += + bndsrc.Hx[global_dof_pos];
+        jHy += + bndsrc.Hy[global_dof_pos];
+        jHz += + bndsrc.Hz[global_dof_pos];
+    }
+
+    if constexpr(do_E)
+    {
+        auto ndotE = nx*jEx + ny*jEy + nz*jEz;
+        auto aE = face_det * fcp.aE[global_dof_pos];
+        auto bE = face_det * fcp.bE[global_dof_pos];
+        flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
+        flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
+        flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
+    }
+
+    if constexpr(do_H)
+    {
+        auto ndotH = nx*jHx + ny*jHy + nz*jHz;
+        auto aH = face_det * fcp.aH[global_dof_pos];
+        auto bH = face_det * fcp.bH[global_dof_pos];
+        flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
+        flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
+        flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+    }
+}
 
 template<size_t K, typename Kernel>
 void
 launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
-    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp, bool use_sources,
     cudaStream_t stream)
 {
-    static const size_t FLUX_THREADS = 256;
+    static const size_t FLUX_THREADS = 1024;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
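Collapsing the three near-identical flux kernels into one template with do_E/do_H flags relies on if constexpr (C++17, supported by recent nvcc) to discard the unused half of the body at compile time, so the E-only and H-only instantiations carry no runtime branch. A minimal self-contained sketch of the pattern (illustrative names and flux expressions, not the solver's):

    #include <cstddef>

    template<bool do_E, bool do_H>
    __global__ void
    flux_kernel(double *eflux, double *hflux, const double *jump, size_t n)
    {
        size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n)
            return;
        if constexpr (do_E)
            eflux[i] = 2.0 * jump[i];    // stand-in for the E-flux expression
        if constexpr (do_H)
            hflux[i] = -2.0 * jump[i];   // stand-in for the H-flux expression
    }

    // flux_kernel<true, true>  computes both families in one pass,
    // flux_kernel<true, false> only E, flux_kernel<false, true> only H.

The runtime use_sources bool, by contrast, stays an ordinary branch: it is uniform across the whole grid, so warps never diverge on it.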
@@ -292,38 +231,50 @@ launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const f
     kernel<<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
         edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
-        num_all_fluxes);
+        num_all_fluxes, use_sources);
 }
 
 void
 gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
     default: throw std::invalid_argument("unsupported order");
@@ -333,32 +284,44 @@ gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
 
    default: throw std::invalid_argument("unsupported order");
@@ -368,32 +331,44 @@ gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
 
    default: throw std::invalid_argument("unsupported order");
@@ -14,6 +14,7 @@
 #include "maxwell_interface.h"
 #include "maxwell_common.h"
 #include "param_loader.h"
+#include "timecounter.h"
 
 #if 0
 static void

@@ -86,23 +87,28 @@ void test_it(const model& mod, State& state, maxwell::parameter_loader& mpl)
     std::cout << "s each." << std::endl;
     std::cout << "Time integrator: " << mpl.sim_timeIntegratorName() << std::endl;
 
+    timecounter tc;
+    tc.tic();
+
     prepare_sources(mod, state, mpl);
     for(size_t i = 0; i < num_timesteps; i++)
     {
         mpl.call_timestep_callback(i);
-        timestep(state, ti);
+        timestep(state, mpl, ti);
         do_sources(mod, state, mpl);
 
         if (i%silo_output_rate == 0)
             export_fields_to_silo(mod, state, mpl);
-        swap(state);
 
         if (i%cycle_print_rate == 0)
         {
+            double time = tc.toc();
             std::cout << "Cycle " << i << ": t = " << state.curr_time << " s";
-            std::cout << std::endl;
+            std::cout << ", DOFs/s: " << (mod.num_dofs()*6*cycle_print_rate)/(time) << std::endl;
+            tc.tic();
         }
+
+        swap(state);
     }
 }
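The new cycle report turns the raw timer into a throughput figure: six field components times mod.num_dofs() DOFs, processed cycle_print_rate times between prints, divided by the elapsed wall time. timecounter.h itself is not part of this diff; a plausible minimal implementation consistent with the tic()/toc() usage above (an assumption, not the repository's actual class):

    #include <chrono>

    // Hypothetical minimal timecounter matching the tic()/toc() calls above.
    class timecounter
    {
        std::chrono::steady_clock::time_point start;

    public:
        void tic() { start = std::chrono::steady_clock::now(); }

        // Elapsed seconds since the last tic().
        double toc() const
        {
            std::chrono::duration<double> d =
                std::chrono::steady_clock::now() - start;
            return d.count();
        }
    };

Note also that swap(state) moves from right after the SILO export to the end of the loop body, past the cycle report.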