From ba874e275b4dfd04c8d14d9fb5a1c2b4548893ed Mon Sep 17 00:00:00 2001
From: Matteo Cicuttin <datafl4sh@toxicnet.eu>
Date: Thu, 13 May 2021 17:40:18 +0200
Subject: [PATCH] Implemented leapfrog on GPU.

---
 include/kernels_gpu.h       |  11 ++
 include/maxwell_interface.h |  15 ++-
 src/kernels_cuda.cu         |  60 +++++++++
 src/maxwell_gpu.cpp         | 135 +++++++++++++++++++-
 src/maxwell_kernels_cuda.cu | 244 ++++++++++++++++++++++++++++++++++--
 5 files changed, 453 insertions(+), 12 deletions(-)

diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h
index fbf43a5..4c0df22 100644
--- a/include/kernels_gpu.h
+++ b/include/kernels_gpu.h
@@ -172,3 +172,14 @@ gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
     const double *k2, const double *k3, const double *k4,
     const double *reaction, const double *material, size_t num_dofs, double dt,
     gpuStream_t stream = 0);
+
+void
+gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
+    const double *tmp, const double *ie, size_t num_dofs, double dt,
+    gpuStream_t stream = 0);
+
+void
+gpu_compute_leapfrog_update(double *next, const double *curr,
+    const double *tmp, const double *im, size_t num_dofs, double dt,
+    gpuStream_t stream = 0);
+
diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h
index 7e1f65d..c5740c2 100644
--- a/include/maxwell_interface.h
+++ b/include/maxwell_interface.h
@@ -363,12 +363,25 @@ init_H_field(const model& mod, solver_state_gpu& state, const Function& H)
     state.emf_curr.Hz.copyin(Hz.data(), Hz.size());
 }
 
+
+#endif /* ENABLE_GPU_SOLVER */
+
 void gpu_compute_jumps(const entity_data_gpu&, const field_gpu&, field_gpu&,
     double *, gpuStream_t stream = 0);
 
+void gpu_compute_jumps_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    double *, gpuStream_t stream = 0);
+
+void gpu_compute_jumps_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    double *, gpuStream_t stream = 0);
+
 void gpu_compute_fluxes(const entity_data_gpu&, const field_gpu&, field_gpu&,
     const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
 
-#endif /* ENABLE_GPU_SOLVER */
+void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+
+void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
+    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
 
 } // namespace maxwell
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 744b0c1..99918c5 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -435,3 +435,63 @@ gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
     gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS, 0, stream>>>(out, in, k1,
         k2, k3, k4, reaction, material, num_dofs, dt);
 }
+
+
+void __global__
+gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict__ curr,
+    const double * __restrict__ soe, const double * __restrict__ tmp,
+    const double * __restrict__ ie, size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+    {
+        double dt_sigma_over_2eps = 0.5*dt*soe[dof];
+        double CR = (1.0 - dt_sigma_over_2eps)/(1.0 + dt_sigma_over_2eps);
+        double CC = dt*ie[dof]/(1.0 + dt_sigma_over_2eps);
+
+        next[dof] = curr[dof]*CR + tmp[dof]*CC;
+    }
+}
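+
+/* The kernel above advances E by one full step with the conductive loss term
+ * sigma*E treated in a time-centered (Crank-Nicolson) fashion:
+ *
+ *   E^{n+1} = CR*E^n + CC*rhs,   rhs = curl H + lifted fluxes,
+ *   CR = (1 - dt*sigma/2eps)/(1 + dt*sigma/2eps),
+ *   CC = (dt/eps)/(1 + dt*sigma/2eps),
+ *
+ * which reduces to the plain explicit update when sigma = 0. */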
+void
+gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
+    const double *tmp, const double *ie, size_t num_dofs, double dt,
+    cudaStream_t stream)
+{
+    static const size_t EULER_THREADS = 256;
+
+    auto gs = num_dofs/EULER_THREADS;
+    if (num_dofs % EULER_THREADS != 0)
+        gs += 1;
+
+    gpu_leapfrog_update_kernel<<<gs, EULER_THREADS, 0, stream>>>(next, curr, soe,
+        tmp, ie, num_dofs, dt);
+}
+
+void __global__
+gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict__ curr,
+    const double * __restrict__ tmp, const double * __restrict__ im,
+    size_t num_dofs, double dt)
+{
+    int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (dof < num_dofs)
+    {
+        double CC = dt*im[dof];
+        next[dof] = curr[dof] + tmp[dof]*CC;
+    }
+}
+
+void
+gpu_compute_leapfrog_update(double *next, const double *curr,
+    const double *tmp, const double *im, size_t num_dofs, double dt,
+    cudaStream_t stream)
+{
+    static const size_t EULER_THREADS = 256;
+
+    auto gs = num_dofs/EULER_THREADS;
+    if (num_dofs % EULER_THREADS != 0)
+        gs += 1;
+
+    gpu_leapfrog_update_kernel<<<gs, EULER_THREADS, 0, stream>>>(next, curr,
+        tmp, im, num_dofs, dt);
+}
diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp
index 9e154ff..48c6dcd 100644
--- a/src/maxwell_gpu.cpp
+++ b/src/maxwell_gpu.cpp
@@ -15,14 +15,13 @@ void
 init_matparams(const model& mod, solver_state_gpu& state,
     const parameter_loader& mpl)
 {
+    state.matparams.num_dofs = mod.num_dofs();
+    state.matparams.num_fluxes = mod.num_fluxes();
     vecxd inv_eps = vecxd::Zero( mod.num_dofs() );
     vecxd inv_mu = vecxd::Zero( mod.num_dofs() );
     vecxd sigma = vecxd::Zero( mod.num_dofs() );
     vecxd sigma_over_eps = vecxd::Zero( mod.num_dofs() );
 
-    vecxd Z = vecxd::Zero( mod.num_dofs() );
-    vecxd Y = vecxd::Zero( mod.num_dofs() );
-
     for (auto& e : mod)
     {
         auto tag = e.gmsh_tag();
@@ -126,6 +125,50 @@ compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
     gpu_subtract(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
 }
 
+static void
+compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+    auto dxp = state.dx.data();
+    auto dyp = state.dy.data();
+    auto dzp = state.dz.data();
+
+    for (const auto& edg : state.edgs)
+    {
+        gpu_compute_field_derivatives(edg, currp.Ex, dxp.Ex, dyp.Ex, dzp.Ex, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Ey, dxp.Ey, dyp.Ey, dzp.Ey, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Ez, dxp.Ez, dyp.Ez, dzp.Ez, 1.0, state.compute_stream);
+    }
+
+    gpu_subtract(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
+}
+
+static void
+compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+{
+    auto currp = curr.data();
+    auto nextp = next.data();
+    auto dxp = state.dx.data();
+    auto dyp = state.dy.data();
+    auto dzp = state.dz.data();
+
+    for (const auto& edg : state.edgs)
+    {
+        gpu_compute_field_derivatives(edg, currp.Hx, dxp.Hx, dyp.Hx, dzp.Hx, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Hy, dxp.Hy, dyp.Hy, dzp.Hy, 1.0, state.compute_stream);
+        gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
+    }
+
+    gpu_subtract(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+    gpu_subtract(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+}
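+
+/* Sign convention: compute_curls_H accumulates +curl(H) into the E components
+ * of `next` (for dE/dt = (1/eps) curl H), while compute_curls_E accumulates
+ * -curl(E) into the H components (for dH/dt = -(1/mu) curl E). */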
+
 static void
 compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 {
@@ -153,6 +196,87 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     }
 }
 
+static void
+compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+{
+    auto matparams_ptrs = state.matparams.data();
+    double *bc_coeffs = matparams_ptrs.bc_coeffs;
+
+    /* Refresh the E-field jumps only; the H-field jumps still hold the values
+     * computed by compute_fluxes_H() in the preceding leapfrog stage. */
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_jumps_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);
+
+    /* The fresh E jumps feed the H-equation flux, which is what gets lifted
+     * into the H components below. */
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
+            state.matparams, state.compute_stream);
+
+    auto fluxesp = state.fluxes.data();
+    auto outp = out.data();
+
+    for (auto& edg : state.edgs)
+    {
+        gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
+    }
+}
+
+static void
+compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+{
+    auto matparams_ptrs = state.matparams.data();
+    double *bc_coeffs = matparams_ptrs.bc_coeffs;
+
+    /* Refresh the H-field jumps only; the E-field jumps are reused from the
+     * previous stage. */
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_jumps_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);
+
+    /* The fresh H jumps feed the E-equation flux, lifted into the E components. */
+    for (const auto& edg : state.edgs)
+        maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
+            state.matparams, state.compute_stream);
+
+    auto fluxesp = state.fluxes.data();
+    auto outp = out.data();
+
+    for (auto& edg : state.edgs)
+    {
+        gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
+        gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
+    }
+}
+
+static void
+leapfrog(solver_state_gpu& state)
+{
+    double *sigma_over_eps = state.matparams.sigma_over_epsilon.data();
+    double *inv_eps = state.matparams.inv_epsilon.data();
+    double *inv_mu = state.matparams.inv_mu.data();
+
+    auto currp = state.emf_curr.data();
+    auto nextp = state.emf_next.data();
+    auto tmpp = state.tmp.data();
+
+    auto dt = state.delta_t;
+
+    /* E update: E^{n+1} from E^n, curl(H) and the E-equation fluxes. */
+    compute_curls_H(state, state.emf_curr, state.tmp);
+    compute_fluxes_H(state, state.emf_curr, state.tmp);
+
+    gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, sigma_over_eps, tmpp.Ex,
+        inv_eps, nextp.num_dofs, dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, sigma_over_eps, tmpp.Ey,
+        inv_eps, nextp.num_dofs, dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Ez, currp.Ez, sigma_over_eps, tmpp.Ez,
+        inv_eps, nextp.num_dofs, dt, state.compute_stream);
+
+    /* H update: uses the freshly computed E^{n+1}. */
+    compute_curls_E(state, state.emf_next, state.tmp);
+    compute_fluxes_E(state, state.emf_next, state.tmp);
+
+    gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, inv_mu, nextp.num_dofs,
+        dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Hy, currp.Hy, tmpp.Hy, inv_mu, nextp.num_dofs,
+        dt, state.compute_stream);
+    gpu_compute_leapfrog_update(nextp.Hz, currp.Hz, tmpp.Hz, inv_mu, nextp.num_dofs,
+        dt, state.compute_stream);
+}
+
 static void
 apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
 {
@@ -231,6 +355,11 @@ timestep(solver_state_gpu& state, time_integrator_type ti)
         compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
     }
 
+    if (ti == time_integrator_type::LEAPFROG)
+    {
+        leapfrog(state);
+    }
+
     state.curr_time += state.delta_t;
     state.curr_timestep += 1;
diff --git a/src/maxwell_kernels_cuda.cu b/src/maxwell_kernels_cuda.cu
index 384983b..91999ca 100644
--- a/src/maxwell_kernels_cuda.cu
+++ b/src/maxwell_kernels_cuda.cu
@@ -86,6 +86,70 @@ gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& ju
     checkGPU(cudaPeekAtLastError());
 }
 
+void
+gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
+    double *bc_coeffs, cudaStream_t stream)
+{
+    static const size_t JUMP_THREADS = 256;
+
+    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
+
+    auto gs = num_all_fluxes/JUMP_THREADS;
+    if (num_all_fluxes % JUMP_THREADS != 0)
+        gs += 1;
+
+    dim3 grid_size(gs);
+    dim3 threads_per_block(JUMP_THREADS);
+
+    /* Compute E-field jumps */
+    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ex.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ex.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ey.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ey.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ez.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ez.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+}
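+
+/* gpu_compute_jumps_E/_H mirror gpu_compute_jumps but touch a single field:
+ * each leapfrog stage refreshes only the jumps that actually changed and
+ * reuses the other field's jumps from the previous stage. */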
+
+void
+gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
+    double *bc_coeffs, cudaStream_t stream)
+{
+    static const size_t JUMP_THREADS = 256;
+
+    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
+
+    auto gs = num_all_fluxes/JUMP_THREADS;
+    if (num_all_fluxes % JUMP_THREADS != 0)
+        gs += 1;
+
+    dim3 grid_size(gs);
+    dim3 threads_per_block(JUMP_THREADS);
+
+    /* Compute H-field jumps */
+    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hx.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hx.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hy.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hy.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+
+    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hz.data(),
+        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hz.data(),
+        bc_coeffs, edg.flux_base, num_all_fluxes);
+    checkGPU(cudaPeekAtLastError());
+}
+
 template<size_t K>
 __global__ void
 gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
@@ -131,8 +195,84 @@ gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_
 }
 
 template<size_t K>
+__global__ void
+gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
+    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
+    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
+    size_t flux_base, size_t num_fluxes)
+{
+    using KS = kernel_gpu_sizes<K>;
+    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
+    auto global_dof_pos = flux_base + local_dof_pos;
+
+    auto entry_num = local_dof_pos/KS::num_fluxes;
+
+    if (local_dof_pos >= num_fluxes)
+        return;
+
+    auto face_det = face_dets[entry_num];
+    auto nx = face_normals[3*entry_num + 0];
+    auto ny = face_normals[3*entry_num + 1];
+    auto nz = face_normals[3*entry_num + 2];
+
+    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+
+    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
+
+    auto aE = face_det * fcp.aE[global_dof_pos];
+    auto bE = face_det * fcp.bE[global_dof_pos];
+
+    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
+    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
+    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
+}
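+
+/* The _E kernel evaluates only the E-equation trace terms: the n x [H]
+ * coupling weighted by aE plus a penalty on the tangential E jump weighted
+ * by bE. The _H kernel below is its dual for the H equation. */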
+
+template<size_t K>
+__global__ void
+gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
+    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
+    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
+    size_t flux_base, size_t num_fluxes)
+{
+    using KS = kernel_gpu_sizes<K>;
+    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
+    auto global_dof_pos = flux_base + local_dof_pos;
+
+    auto entry_num = local_dof_pos/KS::num_fluxes;
+
+    if (local_dof_pos >= num_fluxes)
+        return;
+
+    auto face_det = face_dets[entry_num];
+    auto nx = face_normals[3*entry_num + 0];
+    auto ny = face_normals[3*entry_num + 1];
+    auto nz = face_normals[3*entry_num + 2];
+
+    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+
+    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
+
+    auto aH = face_det * fcp.aH[global_dof_pos];
+    auto bH = face_det * fcp.bH[global_dof_pos];
+
+    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
+    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
+    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+}
+
+template<size_t K, typename Kernel>
 void
-launch_compute_fluxes_kernel(const entity_data_gpu& edg, const field_gpu& jumps,
+launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
     cudaStream_t stream)
 {
@@ -150,7 +290,7 @@ launch_compute_fluxes_kernel(const entity_data_gpu& edg, const field_gpu& jumps,
     auto pp = fcp.data();
     auto bs = bndsrc.data();
 
-    gpu_compute_fluxes_kernel_planar<K><<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
+    kernel<<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
         edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
         num_all_fluxes);
 }
@@ -162,12 +302,100 @@ gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
 {
     switch( edg.a_order )
     {
-    case 1: launch_compute_fluxes_kernel<1>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-    case 2: launch_compute_fluxes_kernel<2>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-    case 3: launch_compute_fluxes_kernel<3>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-    case 4: launch_compute_fluxes_kernel<4>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-    case 5: launch_compute_fluxes_kernel<5>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
-    case 6: launch_compute_fluxes_kernel<6>(edg, jumps, fluxes, bndsrc, fcp, stream); break;
+    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    default: throw std::invalid_argument("unsupported order");
+    }
+}
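+
+/* The _E/_H dispatchers below bind the single-field flux kernels so the
+ * leapfrog stages can update one field at a time; the combined dispatcher
+ * above remains in use on the RK4/apply_operator path. */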
+
+void
+gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    cudaStream_t stream)
+{
+    switch( edg.a_order )
+    {
+    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    default: throw std::invalid_argument("unsupported order");
+    }
+}
+
+void
+gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    cudaStream_t stream)
+{
+    switch( edg.a_order )
+    {
+    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
+                edg, jumps, fluxes, bndsrc, fcp, stream);
+            break;
+
+    default: throw std::invalid_argument("unsupported order");
     }
 }
-- 
GitLab
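
A quick standalone sanity check of the update rule implemented by
gpu_leapfrog_update_kernel, for anyone reviewing the arithmetic. Everything
below (file name, the re-typed kernel and the driver) is illustrative and not
part of the patch; it only mirrors the five-array E update on synthetic data
and compares it against a CPU evaluation of the same formula.

// leapfrog_update_check.cu -- illustrative only, not part of the patch.
// Build: nvcc -o leapfrog_update_check leapfrog_update_check.cu
#include <cstdio>
#include <cmath>
#include <vector>
#include <algorithm>
#include <cuda_runtime.h>

__global__ void
leapfrog_E_update(double *next, const double *curr, const double *soe,
                  const double *rhs, const double *ie, size_t num_dofs, double dt)
{
    size_t dof = blockIdx.x * blockDim.x + threadIdx.x;
    if (dof < num_dofs)
    {
        double h  = 0.5 * dt * soe[dof];        // dt*sigma/(2*eps)
        double CR = (1.0 - h) / (1.0 + h);      // decay of the old field
        double CC = dt * ie[dof] / (1.0 + h);   // weight of the curl+flux rhs
        next[dof] = curr[dof] * CR + rhs[dof] * CC;
    }
}

int main()
{
    const size_t n  = 4096;
    const double dt = 1e-3;

    std::vector<double> curr(n), soe(n), rhs(n), ie(n), next(n);
    for (size_t i = 0; i < n; i++)
    {
        curr[i] = std::sin(0.01 * i);
        rhs[i]  = std::cos(0.01 * i);  // stands in for curl H plus lifted fluxes
        soe[i]  = 2.0;                 // sigma/eps
        ie[i]   = 1.0;                 // 1/eps
    }

    double *d_next, *d_curr, *d_soe, *d_rhs, *d_ie;
    size_t bytes = n * sizeof(double);
    cudaMalloc(&d_next, bytes);
    cudaMalloc(&d_curr, bytes);
    cudaMalloc(&d_soe,  bytes);
    cudaMalloc(&d_rhs,  bytes);
    cudaMalloc(&d_ie,   bytes);
    cudaMemcpy(d_curr, curr.data(), bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_soe,  soe.data(),  bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_rhs,  rhs.data(),  bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ie,   ie.data(),   bytes, cudaMemcpyHostToDevice);

    unsigned int threads = 256;
    unsigned int blocks  = (unsigned int)((n + threads - 1) / threads); // same rounding as the patch
    leapfrog_E_update<<<blocks, threads>>>(d_next, d_curr, d_soe, d_rhs, d_ie, n, dt);
    cudaMemcpy(next.data(), d_next, bytes, cudaMemcpyDeviceToHost);

    // CPU reference of the same update; report the worst deviation.
    double max_err = 0.0;
    for (size_t i = 0; i < n; i++)
    {
        double h   = 0.5 * dt * soe[i];
        double ref = curr[i] * (1.0 - h) / (1.0 + h) + rhs[i] * dt * ie[i] / (1.0 + h);
        max_err = std::max(max_err, std::fabs(next[i] - ref));
    }
    std::printf("max |gpu - cpu| = %g\n", max_err);

    cudaFree(d_next); cudaFree(d_curr); cudaFree(d_soe);
    cudaFree(d_rhs);  cudaFree(d_ie);
    return max_err < 1e-12 ? 0 : 1;
}

The reported deviation should be at (or very near) zero, since host and device
evaluate the same double-precision expression.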