From ba874e275b4dfd04c8d14d9fb5a1c2b4548893ed Mon Sep 17 00:00:00 2001
From: Matteo Cicuttin <datafl4sh@toxicnet.eu>
Date: Thu, 13 May 2021 17:40:18 +0200
Subject: [PATCH] Implemented leapfrog on GPU.

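Add a LEAPFROG branch to timestep() in the GPU solver. The E field is
advanced first, using the curls and fluxes of the current H field and
the conduction factor (1 - dt*sigma/2eps)/(1 + dt*sigma/2eps); the H
field is then advanced using the curls and fluxes of the freshly
updated E field. Curl, jump and flux computations are split into
per-field _E/_H variants so that each half-update only launches the
kernels it needs, and the flux kernel launcher now takes the kernel to
run as a parameter so all variants share the grid-size computation.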
---
 include/kernels_gpu.h       |  11 ++
 include/maxwell_interface.h |  16 ++-
 src/kernels_cuda.cu         |  67 ++++++++++
 src/maxwell_gpu.cpp         | 146 ++++++++++++++++++++-
 src/maxwell_kernels_cuda.cu | 249 +++++++++++++++++++++++++++++++++---
 5 files changed, 477 insertions(+), 12 deletions(-)

diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index fbf43a5..4c0df22 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -172,3 +172,14 @@ gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
     const double *k2, const double *k3, const double *k4, const double *reaction,
     const double *material, size_t num_dofs, double dt,
     gpuStream_t stream = 0);
+
+void
+gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
+    const double *tmp, const double *ie, size_t num_dofs, double dt,
+    gpuStream_t stream = 0);
+
+void
+gpu_compute_leapfrog_update(double *next, const double *curr,
+    const double *tmp, const double *im, size_t num_dofs, double dt,
+    gpuStream_t stream = 0);
+
diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h
index 7e1f65d..c5740c2 100644
--- a/include/maxwell_interface.h
+++ b/include/maxwell_interface.h
@@ -363,12 +363,26 @@ init_H_field(const model& mod, solver_state_gpu& state, const Function& H)
     state.emf_curr.Hz.copyin(Hz.data(), Hz.size());
 }
 
+
+#endif /* ENABLE_GPU_SOLVER */
+
 void gpu_compute_jumps(const entity_data_gpu&, const field_gpu&, field_gpu&,
     double *, gpuStream_t stream = 0);
+
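+/* Per-field jump/flux variants used by the leapfrog time integrator. */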
+void gpu_compute_jumps_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    double *, gpuStream_t stream = 0);
+
+void gpu_compute_jumps_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    double *, gpuStream_t stream = 0);
+
 void gpu_compute_fluxes(const entity_data_gpu&, const field_gpu&, field_gpu&,
     const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
-#endif /* ENABLE_GPU_SOLVER */
 
+void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
 
+void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
 
 } // namespace maxwell
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 744b0c1..99918c5 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -435,3 +435,70 @@ gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
         gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS, 0, stream>>>(out, in, k1, k2, k3, k4,
             reaction, material, num_dofs, dt);
 }
+
+
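+/* Leapfrog update for the E field:
+ *   E_next = CR*E_curr + CC*tmp
+ * with CR = (1 - dt*sigma/2eps)/(1 + dt*sigma/2eps) and
+ * CC = dt*(1/eps)/(1 + dt*sigma/2eps). Here soe = sigma/eps, ie = 1/eps
+ * and tmp holds the assembled curl and flux contributions. */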
+__global__ void
+gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict__ curr,
+    const double * __restrict__ soe, const double * __restrict__ tmp,
+    const double * __restrict__ ie, size_t num_dofs, double dt)
+{
+    size_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+    {
+        double dt_sigma_over_2eps = 0.5*dt*soe[dof];
+        double CR = (1.0 - dt_sigma_over_2eps)/(1.0 + dt_sigma_over_2eps);
+        double CC = dt*ie[dof]/(1.0 + dt_sigma_over_2eps);
+
+        next[dof] = curr[dof]*CR + tmp[dof]*CC;
+    }
+}
+
+void
+gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
+    const double *tmp, const double *ie, size_t num_dofs, double dt,
+    cudaStream_t stream)
+{
+    static const size_t LEAPFROG_THREADS = 256;
+
+    auto gs = num_dofs/LEAPFROG_THREADS;
+    if (num_dofs % LEAPFROG_THREADS != 0)
+        gs += 1;
+
+    gpu_leapfrog_update_kernel<<<gs, LEAPFROG_THREADS, 0, stream>>>(next, curr, soe,
+        tmp, ie, num_dofs, dt);
+}
+
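+/* Leapfrog update for the H field: H_next = H_curr + dt*(1/mu)*tmp,
+ * where im = 1/mu. No conduction term appears in this half-update. */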
+__global__ void
+gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict__ curr,
+    const double * __restrict__ tmp, const double * __restrict__ im,
+    size_t num_dofs, double dt)
+{
+    size_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+    {
+        double CC = dt*im[dof];
+        next[dof] = curr[dof] + tmp[dof]*CC;
+    }
+}
+
+void
+gpu_compute_leapfrog_update(double *next, const double *curr,
+    const double *tmp, const double *im, size_t num_dofs, double dt,
+    cudaStream_t stream)
+{
+    static const size_t LEAPFROG_THREADS = 256;
+
+    auto gs = num_dofs/LEAPFROG_THREADS;
+    if (num_dofs % LEAPFROG_THREADS != 0)
+        gs += 1;
+
+    gpu_leapfrog_update_kernel<<<gs, LEAPFROG_THREADS, 0, stream>>>(next, curr,
+        tmp, im, num_dofs, dt);
+}
diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp
index 9e154ff..48c6dcd 100644
--- a/src/maxwell_gpu.cpp
+++ b/src/maxwell_gpu.cpp
@@ -15,14 +15,13 @@ void
 init_matparams(const model& mod, solver_state_gpu& state,
     const parameter_loader& mpl)
 {
+    state.matparams.num_dofs = mod.num_dofs();
+    state.matparams.num_fluxes = mod.num_fluxes();
     vecxd inv_eps = vecxd::Zero( mod.num_dofs() );
     vecxd inv_mu = vecxd::Zero( mod.num_dofs() );
     vecxd sigma = vecxd::Zero( mod.num_dofs() );
     vecxd sigma_over_eps = vecxd::Zero( mod.num_dofs() );
 
-    vecxd Z = vecxd::Zero( mod.num_dofs() );
-    vecxd Y = vecxd::Zero( mod.num_dofs() );
-
     for (auto& e : mod)
     {
         auto tag = e.gmsh_tag();
@@ -126,6 +125,54 @@ compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
     gpu_subtract(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
 }
 
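+/* Differentiate the E field and assemble the curl terms entering the
+ * H-field half-update, written to the H components of `next`. */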
+static void
+compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+    auto dxp = state.dx.data();
+    auto dyp = state.dy.data();
+    auto dzp = state.dz.data();
+
+    for (const auto& edg : state.edgs)
+    {
+        gpu_compute_field_derivatives(edg, currp.Ex, dxp.Ex, dyp.Ex, dzp.Ex, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Ey, dxp.Ey, dyp.Ey, dzp.Ey, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Ez, dxp.Ez, dyp.Ez, dzp.Ez, 1.0, state.compute_stream);
+    }
+
+
+    gpu_subtract(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
+}
+
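+/* Differentiate the H field and assemble the curl terms entering the
+ * E-field half-update, written to the E components of `next`. */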
+static void
+compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+    auto dxp = state.dx.data();
+    auto dyp = state.dy.data();
+    auto dzp = state.dz.data();
+
+    for (const auto& edg : state.edgs)
+    {
+        gpu_compute_field_derivatives(edg, currp.Hx, dxp.Hx, dyp.Hx, dzp.Hx, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Hy, dxp.Hy, dyp.Hy, dzp.Hy, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
+    }
+
+
+    gpu_subtract(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+}
+
 static void
 compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 {
@@ -153,6 +200,94 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     }
 }
 
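+/* Refresh the E-field jumps, evaluate the corresponding numerical
+ * fluxes and lift them into the H components of `out`, ready for the
+ * H-field half-update. */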
+static void
+compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+{
+    auto matparams_ptrs = state.matparams.data();
+    double *bc_coeffs = matparams_ptrs.bc_coeffs;
+
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_jumps_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);
+
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
+            state.matparams, state.compute_stream);
+
+    auto fluxesp = state.fluxes.data();
+    auto outp = out.data();
+
+    for (auto& edg : state.edgs)
+    {
+        gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
+    }
+}
+
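+/* Refresh the H-field jumps and lift the resulting fluxes into the E
+ * components of `out`, ready for the E-field half-update. */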
+static void
+compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+{
+    auto matparams_ptrs = state.matparams.data();
+    double *bc_coeffs = matparams_ptrs.bc_coeffs;
+
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_jumps_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);
+
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
+            state.matparams, state.compute_stream);
+
+    auto fluxesp = state.fluxes.data();
+    auto outp = out.data();
+
+    for (auto& edg : state.edgs)
+    {
+        gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
+    }
+}
+
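+/* One leapfrog step: E is advanced first with the curls and fluxes of
+ * the current H field, then H with those of the just-updated E field. */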
+static void
+leapfrog(solver_state_gpu& state)
+{
+    double *r = state.matparams.sigma_over_epsilon.data();
+    double *eps = state.matparams.inv_epsilon.data();
+    double *mu = state.matparams.inv_mu.data();
+
+    auto currp = state.emf_curr.data();
+    auto nextp = state.emf_next.data();
+    auto tmpp = state.tmp.data();
+
+    auto dt = state.delta_t;
+    compute_curls_H(state, state.emf_curr, state.tmp);
+    compute_fluxes_H(state, state.emf_curr, state.tmp);
+
+    gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, r, tmpp.Ex, eps, nextp.num_dofs,
+        dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, r, tmpp.Ey, eps, nextp.num_dofs,
+        dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Ez, currp.Ez, r, tmpp.Ez, eps, nextp.num_dofs,
+        dt, state.compute_stream);
+
+    compute_curls_E(state, state.emf_next, state.tmp);
+    compute_fluxes_E(state, state.emf_next, state.tmp);
+
+    gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, mu, nextp.num_dofs,
+        dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Hy, currp.Hy, tmpp.Hy, mu, nextp.num_dofs,
+        dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Hz, currp.Hz, tmpp.Hz, mu, nextp.num_dofs,
+        dt, state.compute_stream);
+}
+
 static void
 apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
 {
@@ -231,6 +366,11 @@ timestep(solver_state_gpu& state, time_integrator_type ti)
         compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
     }
 
+    if (ti == time_integrator_type::LEAPFROG)
+    {
+        leapfrog(state);
+    }
+
     state.curr_time += state.delta_t;
     state.curr_timestep += 1;
 
diff --git a/src/maxwell_kernels_cuda.cu b/src/maxwell_kernels_cuda.cu
index 384983b..91999ca 100644
--- a/src/maxwell_kernels_cuda.cu
+++ b/src/maxwell_kernels_cuda.cu
@@ -86,6 +86,70 @@ gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& ju
     checkGPU(cudaPeekAtLastError());
 }
 
+void
+gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
+    double *bc_coeffs, cudaStream_t stream)
+{
+    static const size_t JUMP_THREADS = 256;
+
+    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
+
+    auto gs = num_all_fluxes/JUMP_THREADS;
+    if (num_all_fluxes % JUMP_THREADS != 0)
+        gs += 1;
+
+    dim3 grid_size(gs);
+    dim3 threads_per_block(JUMP_THREADS);
+
+    /* Compute E-field jumps */
+    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ex.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ex.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ey.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ey.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ez.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ez.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+}
+
+void
+gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
+    double *bc_coeffs, cudaStream_t stream)
+{
+    static const size_t JUMP_THREADS = 256;
+
+    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
+
+    auto gs = num_all_fluxes/JUMP_THREADS;
+    if (num_all_fluxes % JUMP_THREADS != 0)
+        gs += 1;
+
+    dim3 grid_size(gs);
+    dim3 threads_per_block(JUMP_THREADS);
+
+    /* Compute H-field jumps */
+    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hx.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hx.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hy.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hy.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hz.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hz.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+}
+
 template<size_t K>
 __global__ void
 gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
@@ -131,8 +195,89 @@ gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_
 }
 
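+/* Planar-face flux kernels for the leapfrog half-updates: the _E variant
+ * consumes the freshly refreshed E-field jumps and produces the flux
+ * entering the H-field update; the _H variant does the converse. */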
 template<size_t K>
+__global__ void
+gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
+    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
+    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
+    size_t flux_base, size_t num_fluxes)
+{
+    using KS = kernel_gpu_sizes<K>;
+    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
+    auto global_dof_pos = flux_base + local_dof_pos;
+
+    auto entry_num = local_dof_pos/KS::num_fluxes;
+
+    if (local_dof_pos >= num_fluxes)
+        return;
+
+    auto face_det = face_dets[entry_num];
+    auto nx = face_normals[3*entry_num + 0];
+    auto ny = face_normals[3*entry_num + 1];
+    auto nz = face_normals[3*entry_num + 2];
+
+    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+
+    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
+
+    auto aH = face_det * fcp.aH[global_dof_pos];
+    auto bH = face_det * fcp.bH[global_dof_pos];
+
+    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
+    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
+    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+}
+
+template<size_t K>
+__global__ void
+gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
+    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
+    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
+    size_t flux_base, size_t num_fluxes)
+{
+    using KS = kernel_gpu_sizes<K>;
+    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
+    auto global_dof_pos = flux_base + local_dof_pos;
+
+    auto entry_num = local_dof_pos/KS::num_fluxes;
+
+    if (local_dof_pos >= num_fluxes)
+        return;
+
+    auto face_det = face_dets[entry_num];
+    auto nx = face_normals[3*entry_num + 0];
+    auto ny = face_normals[3*entry_num + 1];
+    auto nz = face_normals[3*entry_num + 2];
+
+    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+
+    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
+
+    auto aE = face_det * fcp.aE[global_dof_pos];
+    auto bE = face_det * fcp.bE[global_dof_pos];
+
+    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
+    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
+    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
+}
+
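+/* The launch helper now takes the kernel as an argument, so the combined
+ * and per-field flux kernels share one grid-size computation. */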
+template<size_t K, typename Kernel>
 void
-launch_compute_fluxes_kernel(const entity_data_gpu& edg, const field_gpu& jumps,
+launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
     cudaStream_t stream)
 {
@@ -150,7 +295,7 @@ launch_compute_fluxes_kernel(const entity_data_gpu& edg, const field_gpu& jumps,
     auto pp = fcp.data();
     auto bs = bndsrc.data();
 
-    gpu_compute_fluxes_kernel_planar<K><<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
+    kernel<<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
         edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
         num_all_fluxes);
 }
@@ -162,12 +307,100 @@ gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
 {
     switch( edg.a_order )
     {
-        case 1: launch_compute_fluxes_kernel<1>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-        case 2: launch_compute_fluxes_kernel<2>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-        case 3: launch_compute_fluxes_kernel<3>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-        case 4: launch_compute_fluxes_kernel<4>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-        case 5: launch_compute_fluxes_kernel<5>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-        case 6: launch_compute_fluxes_kernel<6>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
+        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        default: throw std::invalid_argument("unsupported order");
+    }
+}
+
+void
+gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    cudaStream_t stream)
+{
+    switch( edg.a_order )
+    {
+        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        default: throw std::invalid_argument("unsupported order");
+    }
+}
+
+void
+gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    cudaStream_t stream)
+{
+    switch( edg.a_order )
+    {
+        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
+            edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
         default: throw std::invalid_argument("unsupported order");
     }
 }
-- 
GitLab