diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h
index c5740c2ee8503ffcba5ccff570fff24a8a0183ce..aec2ef6a834fc8305b2d9e23aa05b82bda26d1b6 100644
--- a/include/maxwell_interface.h
+++ b/include/maxwell_interface.h
@@ -281,7 +281,13 @@ struct solver_state_gpu
 
     field_gpu               bndsrcs;
     field_gpu               bndsrcs_buf;
+
+    field                   bndsrcs_decomp_cpu;
     pinned_field            bndsrcs_cpu;
+
+    std::vector<size_t>     bndsrcs_decomp_table_cpu;
+    device_vector<size_t>   bndsrcs_decomp_table;
+
     stream                  memcpy_stream;
     stream                  compute_stream;
 };
@@ -384,4 +390,7 @@ void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
 void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
     const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
 
+void decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs,
+    field_gpu& srcs);
+
 } // namespace maxwell
diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp
index 48c6dcd37791e3a9effd8e1101c123c04863da04..bf432a50f43ac6f5e1e486ce6056e788275cd491 100644
--- a/src/maxwell_gpu.cpp
+++ b/src/maxwell_gpu.cpp
@@ -80,8 +80,7 @@ void init_from_model(const model& mod, solver_state_gpu& state)
     state.tmp.resize( mod.num_dofs() );
 
     state.bndsrcs.resize( mod.num_fluxes() );
-    state.bndsrcs_buf.resize( mod.num_fluxes() );
-    state.bndsrcs_cpu.resize( mod.num_fluxes() );
+    state.bndsrcs_decomp_cpu.resize( mod.num_fluxes() );
 
     for (auto& e : mod)
     {
@@ -92,10 +91,53 @@ void init_from_model(const model& mod, solver_state_gpu& state)
         state.edgs.push_back( std::move(edg) );
     }
 
+    auto& bds = mod.boundary_descriptors();
+    size_t face_num_base = 0;
+    for (auto& e : mod)
+    {
+        for (size_t iF = 0; iF < e.num_faces(); iF++)
+        {
+            auto& pf = e.face(iF);
+            auto& rf = e.face_refelem(pf);
+            auto num_fluxes = rf.num_basis_functions();
+
+            auto gfnum = face_num_base + iF;
+            auto bd = bds[gfnum];
+            if (bd.type == face_type::BOUNDARY or bd.type == face_type::INTERFACE)
+            {
+                for (size_t i = 0; i < num_fluxes; i++)
+                {
+                    state.bndsrcs_decomp_table_cpu.push_back(gfnum*num_fluxes+i);
+                }
+            }
+        }
+        face_num_base += e.num_faces();
+    }
+
+    state.bndsrcs_decomp_table.copyin( state.bndsrcs_decomp_table_cpu.data(),
+        state.bndsrcs_decomp_table_cpu.size() );
+
+    state.bndsrcs_cpu.resize( state.bndsrcs_decomp_table_cpu.size() );
+    state.bndsrcs_buf.resize( state.bndsrcs_decomp_table_cpu.size() );
+
     state.curr_time = 0.0;
     state.curr_timestep = 0;
 }
 
+void
+compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field& csrcs)
+{
+    for (size_t i = 0; i < csrcs.num_dofs; i++)
+    {
+        auto from_ofs = state.bndsrcs_decomp_table_cpu[i];
+        csrcs.Ex[i] = srcs.Ex[from_ofs];
+        csrcs.Ey[i] = srcs.Ey[from_ofs];
+        csrcs.Ez[i] = srcs.Ez[from_ofs];
+        csrcs.Hx[i] = srcs.Hx[from_ofs];
+        csrcs.Hy[i] = srcs.Hy[from_ofs];
+        csrcs.Hz[i] = srcs.Hz[from_ofs];
+    }
+}
 
 static void
 compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
@@ -373,14 +415,15 @@ prepare_sources(const model& mod, maxwell::solver_state_gpu& state,
     maxwell::parameter_loader& mpl)
 {
     if ( mpl.boundary_sources_enabled() )
-        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);
 
     if ( mpl.interface_sources_enabled() )
-        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);
 
+    compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
     state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
     state.memcpy_stream.wait();
-    std::swap(state.bndsrcs, state.bndsrcs_buf);
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
 }
 
 void
@@ -389,6 +432,7 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
 {
     if ( mpl.source_has_changed_state() )
     {
+        state.bndsrcs_decomp_cpu.zero();
         state.bndsrcs_cpu.zero();
         mpl.source_was_cleared();
     }
@@ -398,20 +442,23 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
     auto ve = mpl.volume_sources_enabled();
 
     if ( be )
-        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);
 
     if ( ie )
-        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);
 
     if ( be or ie or ve )
+    {
+        compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
         state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
+    }
 }
 
 void
 swap(maxwell::solver_state_gpu& state)
 {
     checkGPU( gpuDeviceSynchronize() );
-    std::swap(state.bndsrcs, state.bndsrcs_buf);
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
     std::swap(state.emf_curr, state.emf_next);
 }
 
diff --git a/src/maxwell_kernels_cuda.cu b/src/maxwell_kernels_cuda.cu
index 91999ca2df1eebacfbf43e9d8f630d4c190ef047..7365e927a56e267686e42e6b4c59ffbd8c2ad7c7 100644
--- a/src/maxwell_kernels_cuda.cu
+++ b/src/maxwell_kernels_cuda.cu
@@ -400,4 +400,38 @@ gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
     }
 }
 
+__global__ void
+gpu_bndsrcs_decompress_kernel(const size_t *dtable, const field_gpu::const_raw_ptrs csrcs,
+    field_gpu::raw_ptrs srcs)
+{
+    auto cdof = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (cdof >= csrcs.num_dofs)
+        return;
+
+    auto ddof = dtable[cdof];
+    srcs.Ex[ddof] = csrcs.Ex[cdof];
+    srcs.Ey[ddof] = csrcs.Ey[cdof];
+    srcs.Ez[ddof] = csrcs.Ez[cdof];
+    srcs.Hx[ddof] = csrcs.Hx[cdof];
+    srcs.Hy[ddof] = csrcs.Hy[cdof];
+    srcs.Hz[ddof] = csrcs.Hz[cdof];
+}
+
+void
+decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs, field_gpu& srcs)
+{
+    static const size_t DECOMPRESS_THREADS = 256;
+
+    auto num_cdofs = csrcs.num_dofs;
+    auto gs = num_cdofs/DECOMPRESS_THREADS;
+    if (num_cdofs % DECOMPRESS_THREADS != 0)
+        gs += 1;
+
+    dim3 grid_size(gs);
+
+    gpu_bndsrcs_decompress_kernel<<<gs, DECOMPRESS_THREADS, 0, state.compute_stream>>>(
+        state.bndsrcs_decomp_table.data(), csrcs.data(), srcs.data());
+}
+
 } // namespace maxwell