diff --git a/include/entity.h b/include/entity.h index 70df6223d9fc821190368363cd348e6508f37c76..7dfabde7bb58e55bca182e5705c760e185ac727a 100644 --- a/include/entity.h +++ b/include/entity.h @@ -65,6 +65,7 @@ public: //matxd project(const vector_function&) const; void project(const scalar_function&, vecxd&) const; void project(const vector_function&, vecxd&, vecxd&, vecxd&) const; + void project(const vector_function&, double *, double *, double *) const; matxd project_on_face(size_t iF, const vector_function&) const; diff --git a/include/gpu_support.hpp b/include/gpu_support.hpp index 475a962528f0c2050f8abdb1002ddaf2ee707d5c..d4502c2fb748a6aa9cc0874396aa26e2220f196f 100644 --- a/include/gpu_support.hpp +++ b/include/gpu_support.hpp @@ -199,6 +199,74 @@ public: } }; +template<typename T> +class pinned_vector +{ + T * m_ptr; + size_t m_size; + +public: + pinned_vector() + : m_ptr(nullptr), m_size(0) + {} + + pinned_vector(size_t size) + : m_ptr(nullptr), m_size(0) + { + (void)checkGPU( gpuMallocHost((void**)&m_ptr, size*sizeof(T)) ); + memset(m_ptr, '\0', size*sizeof(double)); + m_size = size; + } + + pinned_vector(const pinned_vector&) = delete; + pinned_vector(pinned_vector&&) = delete; + pinned_vector& operator=(const pinned_vector&) = delete; + pinned_vector& operator=(pinned_vector&&) = delete; + + void resize(size_t size) + { + if (size == m_size) + return; + + if (m_ptr != nullptr) + (void) checkGPU(gpuFreeHost(m_ptr)); + + (void)checkGPU( gpuMallocHost((void**)&m_ptr, size*sizeof(T)) ); + memset(m_ptr, '\0', size*sizeof(double)); + m_size = size; + } + + void zero(void) + { + memset(m_ptr, '\0', m_size*sizeof(double)); + } + + size_t size(void) const + { + return m_size; + } + + T* data(void) + { + return m_ptr; + } + + const T* data(void) const + { + return m_ptr; + } + + T operator[](size_t pos) const + { + return m_ptr[pos]; + } + + T& operator[](size_t pos) + { + return m_ptr[pos]; + } +}; + template<typename T> class device_vector { @@ -255,6 +323,15 @@ public: (void)checkGPU( gpuMemcpyAsync(m_devptr, src, size*sizeof(T), gpuMemcpyHostToDevice, st) ); } + void copyin(const T *src, size_t start, size_t num, const stream& st) + { + assert(start < m_size); + assert(start+num <= m_size); + const T *c_begin = src + start; + T *d_begin = m_devptr + start; + (void)checkGPU( gpuMemcpyAsync(d_begin, c_begin, num*sizeof(T), gpuMemcpyHostToDevice, st) ); + } + void copyout(T *dst) const { if (m_devptr == nullptr) diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h index 454c1df8d8c59a4ed08fa2d0040c3de779dd242b..c4732e6fa0e4ce9aae44306b33b9c257f6d3222f 100644 --- a/include/kernels_gpu.h +++ b/include/kernels_gpu.h @@ -148,9 +148,12 @@ gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F, void gpu_compute_flux_lifting(entity_data_gpu&, const double *, double *, gpuStream_t stream = 0); -void gpu_subtract(double *, const double *, const double *, size_t, +void gpu_curl(double *, const double *, const double *, size_t, gpuStream_t stream = 0); +void gpu_curl(double *, const double *, const double *, const double *, + size_t, gpuStream_t stream = 0); + void gpu_compute_euler_update(double *out, const double *y, const double *k, const double *reaction, const double *material, size_t num_dofs, double dt, diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h index aec2ef6a834fc8305b2d9e23aa05b82bda26d1b6..8277b482fa9a55da8a26192f457e54697b5465d8 100644 --- a/include/maxwell_interface.h +++ b/include/maxwell_interface.h @@ -65,6 +65,8 @@ public: bool initial_Hfield_defined(void) const; vec3d initial_Hfield(const point_3d&) const; + bool volume_has_source(int) const; + vec3d eval_volume_source(int, const point_3d&, double) const; vec3d eval_boundary_source(int, const point_3d&, double) const; vec3d eval_interface_source(int, const point_3d&, double) const; @@ -250,6 +252,9 @@ struct solver_state field k4; field tmp; + vecxd Jx_src; + vecxd Jy_src; + vecxd Jz_src; field bndsrcs; }; @@ -279,6 +284,17 @@ struct solver_state_gpu field_gpu k4; field_gpu tmp; + + pinned_vector<double> Jx_src; + pinned_vector<double> Jy_src; + pinned_vector<double> Jz_src; + device_vector<double> Jx_src_gpu; + device_vector<double> Jy_src_gpu; + device_vector<double> Jz_src_gpu; + device_vector<double> Jx_src_gpu_buf; + device_vector<double> Jy_src_gpu_buf; + device_vector<double> Jz_src_gpu_buf; + field_gpu bndsrcs; field_gpu bndsrcs_buf; diff --git a/src/entity.cpp b/src/entity.cpp index 30e33790cf7ff868d8f035845ff4f13acfb392db..4eb1f2372a4861328691fccded045157ae80a1bd 100644 --- a/src/entity.cpp +++ b/src/entity.cpp @@ -166,6 +166,51 @@ entity::project(const vector_function& function, vecxd& pfx, vecxd& pfy, vecxd& } } +void +entity::project(const vector_function& function, double *pfx, double *pfy, double *pfz) const +{ + auto num_bf = reference_cells[0].num_basis_functions(); + + for (size_t iT = 0; iT < physical_cells.size(); iT++) + { + const auto& pe = physical_cells[iT]; + assert( pe.orientation() < reference_cells.size() ); + const auto& re = reference_cells[pe.orientation()]; + const auto pqps = pe.integration_points(); + const auto dets = pe.determinants(); + + assert(pqps.size() == dets.size()); + + matxd mass = matxd::Zero(num_bf, num_bf); + vecxd rhs_x = vecxd::Zero(num_bf); + vecxd rhs_y = vecxd::Zero(num_bf); + vecxd rhs_z = vecxd::Zero(num_bf); + for (size_t iQp = 0; iQp < pqps.size(); iQp++) + { + const auto& pqp = pqps[iQp]; + const vecxd phi = re.basis_functions(iQp); + mass += pqp.weight() * phi * phi.transpose(); + vec3d fval = function(pqp.point()); + rhs_x += pqp.weight() * fval(0) * phi; + rhs_y += pqp.weight() * fval(1) * phi; + rhs_z += pqp.weight() * fval(2) * phi; + } + + Eigen::LDLT<matxd> mass_ldlt; + mass_ldlt.compute(mass); + vecxd p_x = mass_ldlt.solve(rhs_x); + vecxd p_y = mass_ldlt.solve(rhs_y); + vecxd p_z = mass_ldlt.solve(rhs_z); + + for (size_t i = 0; i < num_bf; i++) + { + pfx[m_dof_base + iT*num_bf + i] = p_x[i]; + pfy[m_dof_base + iT*num_bf + i] = p_y[i]; + pfz[m_dof_base + iT*num_bf + i] = p_z[i]; + } + } +} + matxd entity::project_on_face(size_t iF, const vector_function& function) const { diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu index 99918c51133c664c5f5bc8f1f0dd4dd4ef9dd72f..2c88604fc3818c5dc1cd3c66e3055529034aec4f 100644 --- a/src/kernels_cuda.cu +++ b/src/kernels_cuda.cu @@ -292,7 +292,7 @@ gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out, } void __global__ -gpu_subtract_kernel(double * __restrict__ dst, const double * __restrict__ add, +gpu_curl_kernel(double * __restrict__ dst, const double * __restrict__ add, const double * __restrict__ sub, size_t num_dofs) { int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; @@ -301,7 +301,7 @@ gpu_subtract_kernel(double * __restrict__ dst, const double * __restrict__ add, } void -gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs, +gpu_curl(double *dst, const double *add, const double *sub, size_t num_dofs, cudaStream_t stream) { static const size_t SUBTRACT_THREADS = 256; @@ -309,7 +309,29 @@ gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs, if (num_dofs % SUBTRACT_THREADS != 0) gs += 1; - gpu_subtract_kernel<<<gs, SUBTRACT_THREADS, 0, stream>>>(dst, add, sub, num_dofs); + gpu_curl_kernel<<<gs, SUBTRACT_THREADS, 0, stream>>>(dst, add, sub, num_dofs); +} + +void __global__ +gpu_curl_kernel(double * __restrict__ dst, const double * __restrict__ add, + const double * __restrict__ sub, const double * __restrict__ source, + size_t num_dofs) +{ + int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; + if (dof < num_dofs) + dst[dof] = add[dof] - sub[dof] - source[dof]; +} + +void +gpu_curl(double *dst, const double *add, const double *sub, const double *source, + size_t num_dofs, cudaStream_t stream) +{ + static const size_t SUBTRACT_THREADS = 256; + auto gs = num_dofs/SUBTRACT_THREADS; + if (num_dofs % SUBTRACT_THREADS != 0) + gs += 1; + + gpu_curl_kernel<<<gs, SUBTRACT_THREADS, 0, stream>>>(dst, add, sub, source, num_dofs); } diff --git a/src/maxwell_cpu.cpp b/src/maxwell_cpu.cpp index a5a13a54949a64bf9298d694bc045d03b5a9cb48..7834684b72a9a442e0caefdf061a1965a3f427c8 100644 --- a/src/maxwell_cpu.cpp +++ b/src/maxwell_cpu.cpp @@ -71,6 +71,9 @@ init_from_model(const model& mod, solver_state& state) state.jumps.resize( mod.num_fluxes() ); state.fluxes.resize( mod.num_fluxes() ); + state.Jx_src = vecxd::Zero( mod.num_dofs() ); + state.Jy_src = vecxd::Zero( mod.num_dofs() ); + state.Jz_src = vecxd::Zero( mod.num_dofs() ); state.bndsrcs.resize( mod.num_fluxes() ); for (auto& e : mod) @@ -97,9 +100,9 @@ compute_curls(solver_state& state, const field& curr, field& next) compute_field_derivatives(ed, curr.Hz, state.dx.Hz, state.dy.Hz, state.dz.Hz); } - next.Ex = state.dy.Hz - state.dz.Hy; - next.Ey = state.dz.Hx - state.dx.Hz; - next.Ez = state.dx.Hy - state.dy.Hx; + next.Ex = state.dy.Hz - state.dz.Hy - state.Jx_src; + next.Ey = state.dz.Hx - state.dx.Hz - state.Jy_src; + next.Ez = state.dx.Hy - state.dy.Hx - state.Jz_src; next.Hx = state.dz.Ey - state.dy.Ez; next.Hy = state.dx.Ez - state.dz.Ex; next.Hz = state.dy.Ex - state.dx.Ey; @@ -130,9 +133,9 @@ compute_curls_H(solver_state& state, const field& curr, field& next) compute_field_derivatives(ed, curr.Hz, state.dx.Hz, state.dy.Hz, state.dz.Hz); } - next.Ex = state.dy.Hz - state.dz.Hy; - next.Ey = state.dz.Hx - state.dx.Hz; - next.Ez = state.dx.Hy - state.dy.Hx; + next.Ex = state.dy.Hz - state.dz.Hy - state.Jx_src; + next.Ey = state.dz.Hx - state.dx.Hz - state.Jy_src; + next.Ez = state.dx.Hy - state.dy.Hx - state.Jz_src; } @@ -536,20 +539,42 @@ timestep(solver_state& state, time_integrator_type ti) } void -do_sources(const model& mod, maxwell::solver_state& state, - maxwell::parameter_loader& mpl) +eval_volume_sources(const model& mod, const parameter_loader& mpl, solver_state& state) +{ + for (auto& e : mod) + { + auto etag = e.gmsh_tag(); + if ( not mpl.volume_has_source(etag) ) + continue; + + auto fJ = [&](const point_3d& pt) -> vec3d { + return mpl.eval_volume_source(etag, pt, state.curr_time); + }; + + e.project(fJ, state.Jx_src, state.Jy_src, state.Jz_src); + } +} + +void +do_sources(const model& mod, solver_state& state, parameter_loader& mpl) { if ( mpl.source_has_changed_state() ) { + state.Jx_src = vecxd::Zero( state.Jx_src.size() ); + state.Jy_src = vecxd::Zero( state.Jy_src.size() ); + state.Jz_src = vecxd::Zero( state.Jz_src.size() ); state.bndsrcs.zero(); mpl.source_was_cleared(); } + if ( mpl.volume_sources_enabled() ) + eval_volume_sources(mod, mpl, state); + if ( mpl.boundary_sources_enabled() ) - maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs); + eval_boundary_sources(mod, mpl, state, state.bndsrcs); if ( mpl.interface_sources_enabled() ) - maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs); + eval_interface_sources(mod, mpl, state, state.bndsrcs); } void diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp index bf432a50f43ac6f5e1e486ce6056e788275cd491..4fd145f388546d42f74b12d9f649255516e519c7 100644 --- a/src/maxwell_gpu.cpp +++ b/src/maxwell_gpu.cpp @@ -79,6 +79,16 @@ void init_from_model(const model& mod, solver_state_gpu& state) state.k4.resize( mod.num_dofs() ); state.tmp.resize( mod.num_dofs() ); + state.Jx_src.resize( mod.num_dofs() ); + state.Jy_src.resize( mod.num_dofs() ); + state.Jz_src.resize( mod.num_dofs() ); + state.Jx_src_gpu.resize( mod.num_dofs() ); + state.Jy_src_gpu.resize( mod.num_dofs() ); + state.Jz_src_gpu.resize( mod.num_dofs() ); + state.Jx_src_gpu_buf.resize( mod.num_dofs() ); + state.Jy_src_gpu_buf.resize( mod.num_dofs() ); + state.Jz_src_gpu_buf.resize( mod.num_dofs() ); + state.bndsrcs.resize( mod.num_fluxes() ); state.bndsrcs_decomp_cpu.resize( mod.num_fluxes() ); @@ -158,13 +168,16 @@ compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next) gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream); } + auto Jx = state.Jx_src_gpu.data(); + auto Jy = state.Jy_src_gpu.data(); + auto Jz = state.Jz_src_gpu.data(); - gpu_subtract(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream); } static void @@ -184,9 +197,9 @@ compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next) } - gpu_subtract(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream); } static void @@ -205,10 +218,13 @@ compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next) gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream); } - - gpu_subtract(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream); - gpu_subtract(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream); + auto Jx = state.Jx_src_gpu.data(); + auto Jy = state.Jy_src_gpu.data(); + auto Jz = state.Jz_src_gpu.data(); + + gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream); + gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream); } static void @@ -426,12 +442,47 @@ prepare_sources(const model& mod, maxwell::solver_state_gpu& state, decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs); } +static void +copyin_volume_sources(const entity& e, solver_state_gpu& state) +{ + auto ofs = e.dof_base(); + auto num = e.num_dofs(); + + state.Jx_src_gpu_buf.copyin( state.Jx_src.data(), ofs, num, state.memcpy_stream ); + state.Jy_src_gpu_buf.copyin( state.Jy_src.data(), ofs, num, state.memcpy_stream ); + state.Jz_src_gpu_buf.copyin( state.Jz_src.data(), ofs, num, state.memcpy_stream ); +} + +static void +eval_volume_sources(const model& mod, const parameter_loader& mpl, solver_state_gpu& state) +{ + for (auto& e : mod) + { + auto etag = e.gmsh_tag(); + if ( not mpl.volume_has_source(etag) ) + continue; + + auto fJ = [&](const point_3d& pt) -> vec3d { + return mpl.eval_volume_source(etag, pt, state.curr_time); + }; + + e.project(fJ, state.Jx_src.data(), state.Jy_src.data(), state.Jz_src.data()); + copyin_volume_sources(e, state); + } +} + void do_sources(const model& mod, maxwell::solver_state_gpu& state, maxwell::parameter_loader& mpl) { if ( mpl.source_has_changed_state() ) { + state.Jx_src.zero(); + state.Jy_src.zero(); + state.Jz_src.zero(); + state.Jx_src_gpu.zero(); + state.Jy_src_gpu.zero(); + state.Jz_src_gpu.zero(); state.bndsrcs_decomp_cpu.zero(); state.bndsrcs_cpu.zero(); mpl.source_was_cleared(); @@ -441,11 +492,15 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state, auto ie = mpl.interface_sources_enabled(); auto ve = mpl.volume_sources_enabled(); + + if ( ve ) + eval_volume_sources(mod, mpl, state); + if ( be ) - maxwell::eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu); + eval_boundary_sources(mod, mpl, state, state.bndsrcs_decomp_cpu); if ( ie ) - maxwell::eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu); + eval_interface_sources(mod, mpl, state, state.bndsrcs_decomp_cpu); if ( be or ie or ve ) { @@ -458,6 +513,9 @@ void swap(maxwell::solver_state_gpu& state) { checkGPU( gpuDeviceSynchronize() ); + std::swap(state.Jx_src_gpu, state.Jx_src_gpu_buf); + std::swap(state.Jy_src_gpu, state.Jy_src_gpu_buf); + std::swap(state.Jz_src_gpu, state.Jz_src_gpu_buf); decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs); std::swap(state.emf_curr, state.emf_next); } diff --git a/src/maxwell_interface.cpp b/src/maxwell_interface.cpp index eebd90025687f00b7eb71af2ad15131d6944333c..5e5e6555861905f8a00182bc097b90cf35bd0408 100644 --- a/src/maxwell_interface.cpp +++ b/src/maxwell_interface.cpp @@ -237,7 +237,7 @@ material_params_gpu::copyin(const material_params& mp) - +#define SEC_VSRCS "sources" #define SEC_BCONDS "bndconds" #define BC_KIND "kind" #define BC_SOURCE "source" @@ -277,6 +277,7 @@ parameter_loader::parameter_loader() lua["postpro"]["H"]["silo_mode"] = "nodal"; lua["postpro"]["J"] = lua.create_table(); lua["postpro"]["J"]["silo_mode"] = "nodal"; + lua[SEC_VSRCS] = lua.create_table(); } bool @@ -559,6 +560,22 @@ parameter_loader::interface_type(int tag) const return interface_condition::UNSPECIFIED; } +bool +parameter_loader::volume_has_source(int tag) const +{ + auto src = lua[SEC_VSRCS][tag]; + return src.valid(); +} + +vec3d +parameter_loader::eval_volume_source(int tag, const point_3d& pt, double t) const +{ + vec3d ret; + sol::tie(ret(0), ret(1), ret(2)) = + lua[SEC_VSRCS][tag](tag, pt.x(), pt.y(), pt.z(), t); + return ret; +} + vec3d parameter_loader::eval_boundary_source(int tag, const point_3d& pt, double t) const {