Commit 044712a8 authored by Matteo Cicuttin

Implemented boundary source compression for copies to GPU.

parent bfde71f6
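This commit replaces the full-size host-to-device copy of the boundary-source field with a compressed one: compress_bndsrc gathers, on the host, only the flux DOFs belonging to BOUNDARY and INTERFACE faces into a compact pinned buffer; that buffer is copied to the device, where gpu_bndsrcs_decompress_kernel scatters it back into the full flux layout through the precomputed index table bndsrcs_decomp_table. A minimal self-contained sketch of the same gather/scatter idea follows, using plain std::vector and hypothetical function names instead of the solver's field, pinned_field and device_vector types:

#include <cstddef>
#include <vector>

// table[i] = offset of compressed entry i inside the full flux-DOF array
std::vector<double>
compress(const std::vector<double>& full, const std::vector<std::size_t>& table)
{
    std::vector<double> packed( table.size() );
    for (std::size_t i = 0; i < table.size(); i++)
        packed[i] = full[ table[i] ];       // gather: full -> packed
    return packed;
}

void
decompress(const std::vector<double>& packed, const std::vector<std::size_t>& table,
    std::vector<double>& full)
{
    for (std::size_t i = 0; i < table.size(); i++)
        full[ table[i] ] = packed[i];       // scatter: packed -> full
}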
@@ -281,7 +281,13 @@ struct solver_state_gpu
     field_gpu bndsrcs;
     field_gpu bndsrcs_buf;
+    field bndsrcs_decomp_cpu;
     pinned_field bndsrcs_cpu;
+    std::vector<size_t> bndsrcs_decomp_table_cpu;
+    device_vector<size_t> bndsrcs_decomp_table;
     stream memcpy_stream;
     stream compute_stream;
 };
@@ -384,4 +390,7 @@ void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
 void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
     const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+void decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs,
+    field_gpu& srcs);
 } // namespace maxwell
@@ -80,8 +80,7 @@ void init_from_model(const model& mod, solver_state_gpu& state)
     state.tmp.resize( mod.num_dofs() );
     state.bndsrcs.resize( mod.num_fluxes() );
-    state.bndsrcs_buf.resize( mod.num_fluxes() );
-    state.bndsrcs_cpu.resize( mod.num_fluxes() );
+    state.bndsrcs_decomp_cpu.resize( mod.num_fluxes() );

     for (auto& e : mod)
     {
@@ -92,10 +91,53 @@ void init_from_model(const model& mod, solver_state_gpu& state)
         state.edgs.push_back( std::move(edg) );
     }

+    auto& bds = mod.boundary_descriptors();
+    size_t face_num_base = 0;
+    for (auto& e : mod)
+    {
+        for (size_t iF = 0; iF < e.num_faces(); iF++)
+        {
+            auto& pf = e.face(iF);
+            auto& rf = e.face_refelem(pf);
+            auto num_fluxes = rf.num_basis_functions();
+
+            auto gfnum = face_num_base + iF;
+            auto bd = bds[gfnum];
+            if (bd.type == face_type::BOUNDARY or bd.type == face_type::INTERFACE)
+            {
+                for (size_t i = 0; i < num_fluxes; i++)
+                {
+                    state.bndsrcs_decomp_table_cpu.push_back(gfnum*num_fluxes+i);
+                }
+            }
+        }
+        face_num_base += e.num_faces();
+    }
+
+    state.bndsrcs_decomp_table.copyin( state.bndsrcs_decomp_table_cpu.data(),
+        state.bndsrcs_decomp_table_cpu.size() );
+
+    state.bndsrcs_cpu.resize( state.bndsrcs_decomp_table_cpu.size() );
+    state.bndsrcs_buf.resize( state.bndsrcs_decomp_table_cpu.size() );
+
     state.curr_time = 0.0;
     state.curr_timestep = 0;
 }

+void
+compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field& csrcs)
+{
+    for (size_t i = 0; i < csrcs.num_dofs; i++)
+    {
+        auto from_ofs = state.bndsrcs_decomp_table_cpu[i];
+        csrcs.Ex[i] = srcs.Ex[from_ofs];
+        csrcs.Ey[i] = srcs.Ey[from_ofs];
+        csrcs.Ez[i] = srcs.Ez[from_ofs];
+        csrcs.Hx[i] = srcs.Hx[from_ofs];
+        csrcs.Hy[i] = srcs.Hy[from_ofs];
+        csrcs.Hz[i] = srcs.Hz[from_ofs];
+    }
+}
+
 static void
 compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
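In the decompression table built above, a face with global number gfnum = face_num_base + iF owns the full-field flux offsets gfnum*num_fluxes through gfnum*num_fluxes + num_fluxes - 1 (the offset formula implies the same num_fluxes for every face). Only BOUNDARY and INTERFACE faces contribute entries, so compressed slot i maps to full-field offset bndsrcs_decomp_table_cpu[i]: assuming, for illustration, 6 flux points per face, if face 10 were the first boundary face encountered, its data would occupy compressed slots 0-5 and decompress to offsets 60-65. The pinned staging buffer and the device-side buffer are accordingly sized to the table length rather than to mod.num_fluxes(), and compress_bndsrc performs the corresponding gather for all six field components.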
@@ -373,14 +415,15 @@ prepare_sources(const model& mod, maxwell::solver_state_gpu& state,
     maxwell::parameter_loader& mpl)
 {
     if ( mpl.boundary_sources_enabled() )
-        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

     if ( mpl.interface_sources_enabled() )
-        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

+    compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
     state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
     state.memcpy_stream.wait();
-    std::swap(state.bndsrcs, state.bndsrcs_buf);
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
 }

 void
@@ -389,6 +432,7 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
 {
     if ( mpl.source_has_changed_state() )
     {
+        state.bndsrcs_decomp_cpu.zero();
         state.bndsrcs_cpu.zero();
         mpl.source_was_cleared();
     }
@@ -398,20 +442,23 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
     auto ve = mpl.volume_sources_enabled();

     if ( be )
-        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

     if ( ie )
-        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

     if ( be or ie or ve )
+    {
+        compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
         state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
+    }
 }

 void
 swap(maxwell::solver_state_gpu& state)
 {
     checkGPU( gpuDeviceSynchronize() );
-    std::swap(state.bndsrcs, state.bndsrcs_buf);
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
     std::swap(state.emf_curr, state.emf_next);
 }
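In the two call sites above, the compressed copy is enqueued on memcpy_stream in do_sources without an immediate wait, so the transfer can overlap with kernels running on compute_stream; the actual decompression happens in swap(), after gpuDeviceSynchronize(), where it takes the place of the previous std::swap of the full-size boundary-source buffers. prepare_sources, which runs once before time stepping, instead waits on memcpy_stream and decompresses immediately.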
@@ -400,4 +400,38 @@ gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
     }
 }

+__global__ void
+gpu_bndsrcs_decompress_kernel(const size_t *dtable, const field_gpu::const_raw_ptrs csrcs,
+    field_gpu::raw_ptrs srcs)
+{
+    auto cdof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (cdof >= csrcs.num_dofs)
+        return;
+
+    auto ddof = dtable[cdof];
+    srcs.Ex[ddof] = csrcs.Ex[cdof];
+    srcs.Ey[ddof] = csrcs.Ey[cdof];
+    srcs.Ez[ddof] = csrcs.Ez[cdof];
+    srcs.Hx[ddof] = csrcs.Hx[cdof];
+    srcs.Hy[ddof] = csrcs.Hy[cdof];
+    srcs.Hz[ddof] = csrcs.Hz[cdof];
+}
+
+void
+decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs, field_gpu& srcs)
+{
+    static const size_t DECOMPRESS_THREADS = 256;
+    auto num_cdofs = csrcs.num_dofs;
+    auto gs = num_cdofs/DECOMPRESS_THREADS;
+    if (num_cdofs % DECOMPRESS_THREADS != 0)
+        gs += 1;
+
+    dim3 grid_size(gs);
+    gpu_bndsrcs_decompress_kernel<<<gs, DECOMPRESS_THREADS, 0, state.compute_stream>>>(
+        state.bndsrcs_decomp_table.data(), csrcs.data(), srcs.data());
+}
+
 } // namespace maxwell