Commit 044712a8 authored by Matteo Cicuttin

Implemented boundary source compression for copies to GPU.

parent bfde71f6
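This commit replaces the full-size host-to-device copy of the boundary-source field with a compressed one: compress_bndsrc gathers, on the host, only the flux DOFs belonging to BOUNDARY and INTERFACE faces into a compact pinned buffer; that buffer is copied to the device, where gpu_bndsrcs_decompress_kernel scatters it back into the full flux layout through the precomputed index table bndsrcs_decomp_table. A minimal self-contained sketch of the same gather/scatter idea follows, using plain std::vector and hypothetical function names instead of the solver's field, pinned_field and device_vector types:

#include <cstddef>
#include <vector>

// table[i] = offset of compressed entry i inside the full flux-DOF array
std::vector<double>
compress(const std::vector<double>& full, const std::vector<std::size_t>& table)
{
    std::vector<double> packed( table.size() );
    for (std::size_t i = 0; i < table.size(); i++)
        packed[i] = full[ table[i] ];       // gather: full -> packed
    return packed;
}

void
decompress(const std::vector<double>& packed, const std::vector<std::size_t>& table,
    std::vector<double>& full)
{
    for (std::size_t i = 0; i < table.size(); i++)
        full[ table[i] ] = packed[i];       // scatter: packed -> full
}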
@@ -281,7 +281,13 @@ struct solver_state_gpu
     field_gpu bndsrcs;
     field_gpu bndsrcs_buf;
+    field bndsrcs_decomp_cpu;
     pinned_field bndsrcs_cpu;
+    std::vector<size_t> bndsrcs_decomp_table_cpu;
+    device_vector<size_t> bndsrcs_decomp_table;
     stream memcpy_stream;
     stream compute_stream;
 };
@@ -384,4 +390,7 @@ void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
 void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
     const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+void decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs,
+    field_gpu& srcs);
 } // namespace maxwell
@@ -80,8 +80,7 @@ void init_from_model(const model& mod, solver_state_gpu& state)
     state.tmp.resize( mod.num_dofs() );
     state.bndsrcs.resize( mod.num_fluxes() );
-    state.bndsrcs_buf.resize( mod.num_fluxes() );
-    state.bndsrcs_cpu.resize( mod.num_fluxes() );
+    state.bndsrcs_decomp_cpu.resize( mod.num_fluxes() );

     for (auto& e : mod)
     {
@@ -92,10 +91,53 @@ void init_from_model(const model& mod, solver_state_gpu& state)
         state.edgs.push_back( std::move(edg) );
     }

+    auto& bds = mod.boundary_descriptors();
+    size_t face_num_base = 0;
+    for (auto& e : mod)
+    {
+        for (size_t iF = 0; iF < e.num_faces(); iF++)
+        {
+            auto& pf = e.face(iF);
+            auto& rf = e.face_refelem(pf);
+            auto num_fluxes = rf.num_basis_functions();
+
+            auto gfnum = face_num_base + iF;
+            auto bd = bds[gfnum];
+            if (bd.type == face_type::BOUNDARY or bd.type == face_type::INTERFACE)
+            {
+                for (size_t i = 0; i < num_fluxes; i++)
+                {
+                    state.bndsrcs_decomp_table_cpu.push_back(gfnum*num_fluxes+i);
+                }
+            }
+        }
+        face_num_base += e.num_faces();
+    }
+
+    state.bndsrcs_decomp_table.copyin( state.bndsrcs_decomp_table_cpu.data(),
+        state.bndsrcs_decomp_table_cpu.size() );
+
+    state.bndsrcs_cpu.resize( state.bndsrcs_decomp_table_cpu.size() );
+    state.bndsrcs_buf.resize( state.bndsrcs_decomp_table_cpu.size() );
+
     state.curr_time = 0.0;
     state.curr_timestep = 0;
 }

+void
+compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field& csrcs)
+{
+    for (size_t i = 0; i < csrcs.num_dofs; i++)
+    {
+        auto from_ofs = state.bndsrcs_decomp_table_cpu[i];
+        csrcs.Ex[i] = srcs.Ex[from_ofs];
+        csrcs.Ey[i] = srcs.Ey[from_ofs];
+        csrcs.Ez[i] = srcs.Ez[from_ofs];
+        csrcs.Hx[i] = srcs.Hx[from_ofs];
+        csrcs.Hy[i] = srcs.Hy[from_ofs];
+        csrcs.Hz[i] = srcs.Hz[from_ofs];
+    }
+}
+
 static void
 compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
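In the decompression table built above, a face with global number gfnum = face_num_base + iF owns the full-field flux offsets gfnum*num_fluxes through gfnum*num_fluxes + num_fluxes - 1 (the offset formula implies the same num_fluxes for every face). Only BOUNDARY and INTERFACE faces contribute entries, so compressed slot i maps to full-field offset bndsrcs_decomp_table_cpu[i]: assuming, for illustration, 6 flux points per face, if face 10 were the first boundary face encountered, its data would occupy compressed slots 0-5 and decompress to offsets 60-65. The pinned staging buffer and the device-side buffer are accordingly sized to the table length rather than to mod.num_fluxes(), and compress_bndsrc performs the corresponding gather for all six field components.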
@@ -373,14 +415,15 @@ prepare_sources(const model& mod, maxwell::solver_state_gpu& state,
     maxwell::parameter_loader& mpl)
 {
     if ( mpl.boundary_sources_enabled() )
-        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

     if ( mpl.interface_sources_enabled() )
-        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

+    compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
     state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
     state.memcpy_stream.wait();
-    std::swap(state.bndsrcs, state.bndsrcs_buf);
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
 }

 void
@@ -389,6 +432,7 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
 {
     if ( mpl.source_has_changed_state() )
     {
+        state.bndsrcs_decomp_cpu.zero();
         state.bndsrcs_cpu.zero();
         mpl.source_was_cleared();
     }
@@ -398,20 +442,23 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
     auto ve = mpl.volume_sources_enabled();

     if ( be )
-        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

     if ( ie )
-        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_cpu);
+        maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu);

     if ( be or ie or ve )
+    {
+        compress_bndsrc(state, state.bndsrcs_decomp_cpu, state.bndsrcs_cpu);
         state.bndsrcs_buf.copyin(state.bndsrcs_cpu, state.memcpy_stream);
+    }
 }

 void
 swap(maxwell::solver_state_gpu& state)
 {
     checkGPU( gpuDeviceSynchronize() );
-    std::swap(state.bndsrcs, state.bndsrcs_buf);
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
     std::swap(state.emf_curr, state.emf_next);
 }
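In the two call sites above, the compressed copy is enqueued on memcpy_stream in do_sources without an immediate wait, so the transfer can overlap with kernels running on compute_stream; the actual decompression happens in swap(), after gpuDeviceSynchronize(), where it takes the place of the previous std::swap of the full-size boundary-source buffers. prepare_sources, which runs once before time stepping, instead waits on memcpy_stream and decompresses immediately.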
@@ -400,4 +400,38 @@ gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
     }
 }

+__global__ void
+gpu_bndsrcs_decompress_kernel(const size_t *dtable, const field_gpu::const_raw_ptrs csrcs,
+    field_gpu::raw_ptrs srcs)
+{
+    auto cdof = blockIdx.x * blockDim.x + threadIdx.x;
+    if (cdof >= csrcs.num_dofs)
+        return;
+
+    auto ddof = dtable[cdof];
+    srcs.Ex[ddof] = csrcs.Ex[cdof];
+    srcs.Ey[ddof] = csrcs.Ey[cdof];
+    srcs.Ez[ddof] = csrcs.Ez[cdof];
+    srcs.Hx[ddof] = csrcs.Hx[cdof];
+    srcs.Hy[ddof] = csrcs.Hy[cdof];
+    srcs.Hz[ddof] = csrcs.Hz[cdof];
+}
+
+void
+decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs, field_gpu& srcs)
+{
+    static const size_t DECOMPRESS_THREADS = 256;
+    auto num_cdofs = csrcs.num_dofs;
+    auto gs = num_cdofs/DECOMPRESS_THREADS;
+    if (num_cdofs % DECOMPRESS_THREADS != 0)
+        gs += 1;
+
+    dim3 grid_size(gs);
+    gpu_bndsrcs_decompress_kernel<<<gs, DECOMPRESS_THREADS, 0, state.compute_stream>>>(
+        state.bndsrcs_decomp_table.data(), csrcs.data(), srcs.data());
+}
+
 } // namespace maxwell