diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h
index 8277b482fa9a55da8a26192f457e54697b5465d8..d5f7626fe4a4dfef5ab8288ed7f2e00204c354b3 100644
--- a/include/maxwell_interface.h
+++ b/include/maxwell_interface.h
@@ -314,7 +314,7 @@ void init_from_model(const model&, solver_state&);
 void init_matparams(const model&, solver_state&, const parameter_loader&);
 //void apply_operator(solver_state&, const field&, field&);
 void export_fields_to_silo(const model&, const solver_state&, const parameter_loader&);
-void timestep(solver_state&, time_integrator_type);
+void timestep(solver_state&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state&);
@@ -324,7 +324,7 @@ void init_from_model(const model&, solver_state_gpu&);
 void init_matparams(const model&, solver_state_gpu&, const parameter_loader&);
 //void apply_operator(solver_state_gpu&, const field_gpu&, field_gpu&);
 void export_fields_to_silo(const model&, const solver_state_gpu&, const parameter_loader&);
-void timestep(solver_state_gpu&, time_integrator_type);
+void timestep(solver_state_gpu&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state_gpu&);
@@ -398,13 +398,13 @@ void gpu_compute_jumps_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
     double *, gpuStream_t stream = 0);
 
 void gpu_compute_fluxes(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 
 void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 
 void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 
 void decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs,
     field_gpu& srcs);
diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu
index 2c88604fc3818c5dc1cd3c66e3055529034aec4f..050d27fa568239409f1c1de837aa73bccfd34993 100644
--- a/src/kernels_cuda.cu
+++ b/src/kernels_cuda.cu
@@ -227,7 +227,7 @@ gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
     for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
     {
         int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
-        int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes);
+        int32_t f_ofs = elem_flux_base + dof;
         acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs);
     }
@@ -304,7 +304,7 @@ void gpu_curl(double *dst, const double *add, const double *sub,
     size_t num_dofs, cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
 
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
@@ -326,7 +326,7 @@ void gpu_curl(double *dst, const double *add, const double *sub,
     const double *source, size_t num_dofs, cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
 
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
@@ -468,8 +468,9 @@ gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict
     if (dof < num_dofs)
     {
         double dt_sigma_over_2eps = 0.5*dt*soe[dof];
-        double CR = (1.0 - dt_sigma_over_2eps)/(1.0 + dt_sigma_over_2eps);
-        double CC = dt*ie[dof]/(1.0 + dt_sigma_over_2eps);
+        double k = 1./(1.0 + dt_sigma_over_2eps);
+        double CR = (1.0 - dt_sigma_over_2eps)*k;
+        double CC = dt*ie[dof]*k;
 
         next[dof] = curr[dof]*CR + tmp[dof]*CC;
     }
@@ -480,7 +481,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
     const double *tmp, const double *ie, size_t num_dofs, double dt, cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
 
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
@@ -508,7 +509,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr,
     const double *tmp, const double *im, size_t num_dofs, double dt, cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
 
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
diff --git a/src/maxwell_cpu.cpp b/src/maxwell_cpu.cpp
index 7834684b72a9a442e0caefdf061a1965a3f427c8..4adfdecb5bfe840d586f9a69d6f5cd6927f13edb 100644
--- a/src/maxwell_cpu.cpp
+++ b/src/maxwell_cpu.cpp
@@ -505,7 +505,7 @@ leapfrog(solver_state& state)
 }
 
 void
-timestep(solver_state& state, time_integrator_type ti)
+timestep(solver_state& state, const parameter_loader&, time_integrator_type ti)
 {
     if (ti == time_integrator_type::EULER)
     {
diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp
index b1d5bad9711fe4929d2ed6c7c98442161ee4b986..df2409056898526477370ed1c13c0373da9fa769 100644
--- a/src/maxwell_gpu.cpp
+++ b/src/maxwell_gpu.cpp
@@ -150,7 +150,7 @@ compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field&
 }
 
 static void
-compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -172,9 +172,19 @@ compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
     auto Jx = state.Jx_src_gpu.data();
     auto Jy = state.Jy_src_gpu.data();
     auto Jz = state.Jz_src_gpu.data();
-    gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    if (use_sources)
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
+
     gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
@@ -203,7 +213,7 @@ compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
 }
 
 static void
-compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -218,17 +228,26 @@ compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
         gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
     }
 
-    auto Jx = state.Jx_src_gpu.data();
-    auto Jy = state.Jy_src_gpu.data();
-    auto Jz = state.Jz_src_gpu.data();
+    if (use_sources)
+    {
+        auto Jx = state.Jx_src_gpu.data();
+        auto Jy = state.Jy_src_gpu.data();
+        auto Jz = state.Jz_src_gpu.data();
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
 
-    gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
-    gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
 }
 
 static void
-compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -238,7 +257,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -255,7 +274,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -265,7 +284,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -279,7 +298,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -289,7 +308,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -303,7 +322,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-leapfrog(solver_state_gpu& state)
+leapfrog(solver_state_gpu& state, const parameter_loader& mpl)
 {
     double *r = state.matparams.sigma_over_epsilon.data();
     double *eps = state.matparams.inv_epsilon.data();
@@ -313,10 +332,14 @@ leapfrog(solver_state_gpu& state)
     auto nextp = state.emf_next.data();
     auto tmpp = state.tmp.data();
-    auto dt = state.delta_t;
 
-    compute_curls_H(state, state.emf_curr, state.tmp);
-    compute_fluxes_H(state, state.emf_curr, state.tmp);
+    bool ve = mpl.volume_sources_enabled();
+    compute_curls_H(state, state.emf_curr, state.tmp, ve);
+
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
+    compute_fluxes_H(state, state.emf_curr, state.tmp, be or ie);
 
+    auto dt = state.delta_t;
     gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, r, tmpp.Ex, eps, nextp.num_dofs,
         dt, state.compute_stream);
     gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, r, tmpp.Ey, eps, nextp.num_dofs,
@@ -325,7 +348,7 @@ leapfrog(solver_state_gpu& state)
         dt, state.compute_stream);
 
     compute_curls_E(state, state.emf_next, state.tmp);
-    compute_fluxes_E(state, state.emf_next, state.tmp);
+    compute_fluxes_E(state, state.emf_next, state.tmp, be or ie);
 
     gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, mu, nextp.num_dofs,
         dt, state.compute_stream);
@@ -336,10 +359,14 @@ leapfrog(solver_state_gpu& state)
 }
 
 static void
-apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+apply_operator(solver_state_gpu& state, const parameter_loader& mpl, const field_gpu& curr,
+    field_gpu& next)
 {
-    compute_curls(state, curr, next);
-    compute_fluxes(state, curr, next);
+    bool ve = mpl.volume_sources_enabled();
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
+    compute_curls(state, curr, next, ve);
+    compute_fluxes(state, curr, next, be or ie);
 }
 
 static void
@@ -386,36 +413,36 @@ compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
 }
 
 void
-timestep(solver_state_gpu& state, time_integrator_type ti)
+timestep(solver_state_gpu& state, const parameter_loader& mpl, time_integrator_type ti)
 {
     //timecounter tc;
     //tc.tic();
 
     if (ti == time_integrator_type::EULER)
     {
-        apply_operator(state, state.emf_curr, state.tmp);
+        apply_operator(state, mpl, state.emf_curr, state.tmp);
         compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::RK4)
     {
-        apply_operator(state, state.emf_curr, state.k1);
+        apply_operator(state, mpl, state.emf_curr, state.k1);
         compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k2);
+        apply_operator(state, mpl, state.tmp, state.k2);
         compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k3);
-
+        apply_operator(state, mpl, state.tmp, state.k3);
         compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k4);
+        apply_operator(state, mpl, state.tmp, state.k4);
         compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::LEAPFROG)
    {
-        leapfrog(state);
+        leapfrog(state, mpl);
    }
 
     state.curr_time += state.delta_t;
@@ -483,6 +510,9 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
         state.Jx_src_gpu.zero();
         state.Jy_src_gpu.zero();
         state.Jz_src_gpu.zero();
+        state.Jx_src_gpu_buf.zero();
+        state.Jy_src_gpu_buf.zero();
+        state.Jz_src_gpu_buf.zero();
         state.bndsrcs_decomp_cpu.zero();
         state.bndsrcs_cpu.zero();
         mpl.source_was_cleared();
@@ -510,13 +540,18 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
 }
 
 void
-swap(maxwell::solver_state_gpu& state)
+swap(solver_state_gpu& state)
 {
     checkGPU( gpuDeviceSynchronize() );
+
+    //auto be = mpl.boundary_sources_enabled();
+    //auto ie = mpl.interface_sources_enabled()
+    //if ( be or ie )
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
+
     std::swap(state.Jx_src_gpu, state.Jx_src_gpu_buf);
     std::swap(state.Jy_src_gpu, state.Jy_src_gpu_buf);
     std::swap(state.Jz_src_gpu, state.Jz_src_gpu_buf);
-    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
     std::swap(state.emf_curr, state.emf_next);
 }
diff --git a/src/maxwell_kernels_cuda.cu b/src/maxwell_kernels_cuda.cu
index 7365e927a56e267686e42e6b4c59ffbd8c2ad7c7..3e7d848f49dd24dd1f4f2fec98b0aac68a7d3022 100644
--- a/src/maxwell_kernels_cuda.cu
+++ b/src/maxwell_kernels_cuda.cu
@@ -42,7 +42,7 @@ void
 gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -90,7 +90,7 @@ void
 gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -122,7 +122,7 @@ void
 gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -150,12 +150,12 @@ gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu&
     checkGPU(cudaPeekAtLastError());
 }
 
-template<size_t K>
+template<size_t K, bool do_E, bool do_H>
 __global__ void
 gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
     field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
     const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
+    size_t flux_base, size_t num_fluxes, bool use_sources)
 {
     using KS = kernel_gpu_sizes<K>;
     auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
@@ -171,112 +171,51 @@ gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_
     auto ny = face_normals[3*entry_num + 1];
     auto nz = face_normals[3*entry_num + 2];
 
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+    auto jEx = jump.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos];
 
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
-
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
-
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
-
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
+    if (use_sources)
+    {
+        jEx += - bndsrc.Ex[global_dof_pos];
+        jEy += - bndsrc.Ey[global_dof_pos];
+        jEz += - bndsrc.Ez[global_dof_pos];
+        jHx += + bndsrc.Hx[global_dof_pos];
+        jHy += + bndsrc.Hy[global_dof_pos];
+        jHz += + bndsrc.Hz[global_dof_pos];
+    }
 
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
+    if constexpr(do_E)
+    {
+        auto ndotE = nx*jEx + ny*jEy + nz*jEz;
+        auto aE = face_det * fcp.aE[global_dof_pos];
+        auto bE = face_det * fcp.bE[global_dof_pos];
+        flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
+        flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
+        flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
+    }
 
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
-
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+    if constexpr(do_H)
+    {
+        auto ndotH = nx*jHx + ny*jHy + nz*jHz;
+        auto aH = face_det * fcp.aH[global_dof_pos];
+        auto bH = face_det * fcp.bH[global_dof_pos];
+        flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
+        flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
+        flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+    }
 }
 
 template<size_t K, typename Kernel>
 void
 launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
-    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp, bool use_sources,
     cudaStream_t stream)
 {
-    static const size_t FLUX_THREADS = 256;
+    static const size_t FLUX_THREADS = 1024;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -292,38 +231,50 @@ launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const f
     kernel<<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
         edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
-        num_all_fluxes);
+        num_all_fluxes, use_sources);
 }
 
 void
 gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps, field_gpu& fluxes,
     const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
-        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
-        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
-        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
 
-        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
 
-        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
+        case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
 
        default: throw std::invalid_argument("unsupported order");
@@ -333,33 +284,45 @@ gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps, field_gpu& fluxes,
     const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
+        case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
 
         default: throw std::invalid_argument("unsupported order");
     }
@@ -368,33 +331,45 @@ gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps, field_gpu& fluxes,
     const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
-
-        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
-            edg, jumps, fluxes, bndsrc, fcp, stream);
-            break;
+        case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
+
+        case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
+            break;
 
         default: throw std::invalid_argument("unsupported order");
     }
diff --git a/src/maxwell_solver.cpp b/src/maxwell_solver.cpp
index 69ee2b95ce30a2e91c9bebd66be8b562e1874b52..78c09f6c7d4164897e9e0513c29d42c8f1e35070 100644
--- a/src/maxwell_solver.cpp
+++ b/src/maxwell_solver.cpp
@@ -14,6 +14,7 @@
 #include "maxwell_interface.h"
 #include "maxwell_common.h"
 #include "param_loader.h"
+#include "timecounter.h"
 
 #if 0
 static void
@@ -86,23 +87,28 @@ void test_it(const model& mod, State& state, maxwell::parameter_loader& mpl)
     std::cout << "s each." << std::endl;
     std::cout << "Time integrator: " << mpl.sim_timeIntegratorName() << std::endl;
 
+    timecounter tc;
+    tc.tic();
     prepare_sources(mod, state, mpl);
     for(size_t i = 0; i < num_timesteps; i++)
     {
         mpl.call_timestep_callback(i);
-        timestep(state, ti);
+        timestep(state, mpl, ti);
         do_sources(mod, state, mpl);
 
         if (i%silo_output_rate == 0)
             export_fields_to_silo(mod, state, mpl);
 
+        swap(state);
+
         if (i%cycle_print_rate == 0)
         {
+            double time = tc.toc();
             std::cout << "Cycle " << i << ": t = " << state.curr_time << " s";
-            std::cout << std::endl;
+            std::cout << ", DOFs/s: " << (mod.num_dofs()*6*cycle_print_rate)/(time) << std::endl;
+            tc.tic();
         }
 
-        swap(state);
     }
 }