diff --git a/include/gpu_support.hpp b/include/gpu_support.hpp index 4b9ca038166beafaf6a836d7ad2ed288812dd40e..e802a75f79fb981407e23ce98a8d4c1cabd13587 100644 --- a/include/gpu_support.hpp +++ b/include/gpu_support.hpp @@ -225,6 +225,7 @@ public: (void) checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) ); m_size = other.m_size; + return *this; } void zero(void) diff --git a/include/kernels_gpu.h b/include/kernels_gpu.h index 1c81c6d51719278d8f16cea65646973cb69b006f..b61d926e3fbf1bb3c220139f51a79f10c1c27d19 100644 --- a/include/kernels_gpu.h +++ b/include/kernels_gpu.h @@ -148,4 +148,22 @@ gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F, void gpu_compute_flux_lifting(entity_data_gpu&, const double *, double *); -void gpu_subtract(double *, const double *, const double *, size_t); \ No newline at end of file +void gpu_subtract(double *, const double *, const double *, size_t); + +void +gpu_compute_euler_update(double *out, const double *y, const double *k, + const double *reaction, const double *material, size_t num_dofs, double dt); + +void +gpu_compute_euler_update(double *out, const double *y, const double *k, + const double *material, size_t num_dofs, double dt); + +void +gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1, + const double *k2, const double *k3, const double *k4, + const double *material, size_t num_dofs, double dt); + +void +gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1, + const double *k2, const double *k3, const double *k4, const double *reaction, + const double *material, size_t num_dofs, double dt); \ No newline at end of file diff --git a/include/maxwell_interface.h b/include/maxwell_interface.h index f946095317b53bbf6f8e54f140222b4848e547c2..c547175dba201f35209de567353967239fddeda9 100644 --- a/include/maxwell_interface.h +++ b/include/maxwell_interface.h @@ -163,6 +163,12 @@ struct solver_state_gpu field_gpu dx; field_gpu dy; field_gpu dz; + + field_gpu k1; + field_gpu k2; + field_gpu k3; + field_gpu k4; + field_gpu tmp; }; void init_from_model(const model&, solver_state&); diff --git a/src/kernels_cuda.cu b/src/kernels_cuda.cu index da11465929dad3538f57be27107f8072254f8461..4591446a57f5426b44c01ccd5d4fd7547959b3f6 100644 --- a/src/kernels_cuda.cu +++ b/src/kernels_cuda.cu @@ -306,4 +306,125 @@ gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs) gs += 1; gpu_subtract_kernel<<<gs, SUBTRACT_THREADS>>>(dst, add, sub, num_dofs); +} + + +void __global__ +gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y, + const double * __restrict__ k, size_t num_dofs, double dt) +{ + int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; + if (dof < num_dofs) + out[dof] = y[dof] + dt*k[dof]; +} + +void __global__ +gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y, + const double * __restrict__ k, const double * __restrict__ material, + size_t num_dofs, double dt) +{ + int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; + if (dof < num_dofs) + out[dof] = y[dof] + dt*k[dof]*material[dof]; +} + +void +gpu_compute_euler_update(double *out, const double *y, const double *k, + const double *material, size_t num_dofs, double dt) +{ + static const size_t EULER_THREADS = 256; + + auto gs = num_dofs/EULER_THREADS; + if (num_dofs % EULER_THREADS != 0) + gs += 1; + + gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, material, num_dofs, dt); +} + +void __global__ +gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y, + const double * __restrict__ k, const double * __restrict__ reaction, + const double * __restrict__ material, size_t num_dofs, double dt) +{ + int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; + if (dof < num_dofs) + out[dof] = y[dof]*(1-dt*reaction[dof]) + dt*k[dof]*material[dof]; +} + +void +gpu_compute_euler_update(double *out, const double *y, const double *k, + const double *reaction, const double *material, size_t num_dofs, double dt) +{ + static const size_t EULER_THREADS = 256; + + auto gs = num_dofs/EULER_THREADS; + if (num_dofs % EULER_THREADS != 0) + gs += 1; + + gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, reaction, material, num_dofs, dt); +} + +void __global__ +gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in, + const double * __restrict__ k1, const double * __restrict__ k2, + const double * __restrict__ k3, const double * __restrict__ k4, + size_t num_dofs, double dt) +{ + int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; + if (dof < num_dofs) + out[dof] = in[dof] + dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6; +} + +void __global__ +gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in, + const double * __restrict__ k1, const double * __restrict__ k2, + const double * __restrict__ k3, const double * __restrict__ k4, + const double * __restrict__ material, + size_t num_dofs, double dt) +{ + int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; + if (dof < num_dofs) + out[dof] = in[dof] + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6; +} + +void +gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1, + const double *k2, const double *k3, const double *k4, + const double *material, size_t num_dofs, double dt) +{ + static const size_t RK4_THREADS = 256; + + auto gs = num_dofs/RK4_THREADS; + if (num_dofs % RK4_THREADS != 0) + gs += 1; + + gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4, + material, num_dofs, dt); +} + +void __global__ +gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in, + const double * __restrict__ k1, const double * __restrict__ k2, + const double * __restrict__ k3, const double * __restrict__ k4, + const double * __restrict__ reaction, const double * __restrict__ material, + size_t num_dofs, double dt) +{ + int32_t dof = blockIdx.x * blockDim.x + threadIdx.x; + if (dof < num_dofs) + out[dof] = in[dof]*(1-dt*reaction[dof]) + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6; +} + +void +gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1, + const double *k2, const double *k3, const double *k4, const double *reaction, + const double *material, size_t num_dofs, double dt) +{ + static const size_t RK4_THREADS = 256; + + auto gs = num_dofs/RK4_THREADS; + if (num_dofs % RK4_THREADS != 0) + gs += 1; + + gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4, + reaction, material, num_dofs, dt); } \ No newline at end of file diff --git a/src/maxwell_gpu.cpp b/src/maxwell_gpu.cpp index b53f57e44bbfc213a3b6b4d312f2fbec97fb566e..9b20e65717e9da4dbc4bf628beb815e545fc0c58 100644 --- a/src/maxwell_gpu.cpp +++ b/src/maxwell_gpu.cpp @@ -16,18 +16,33 @@ void init_from_model(const model& mod, solver_state_gpu& state) state.dx.resize( mod.num_dofs() ); state.dy.resize( mod.num_dofs() ); state.dz.resize( mod.num_dofs() ); + state.jumps.resize( mod.num_fluxes() ); state.fluxes.resize( mod.num_fluxes() ); + std::vector<double> zeros( mod.num_dofs(), 0.0 ); std::vector<double> onehalfs( mod.num_fluxes(), 0.5 ); + std::vector<double> ones( mod.num_dofs(), 1.0 ); std::vector<double> twos( mod.num_fluxes(), 2.0 ); + state.matparams.inv_epsilon.copyin( ones.data(), ones.size() ); + state.matparams.inv_mu.copyin( ones.data(), ones.size() ); + state.matparams.sigma.copyin( zeros.data(), zeros.size() ); + state.matparams.sigma_over_epsilon.copyin( zeros.data(), zeros.size() ); + state.matparams.aE.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.bE.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.aH.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.bH.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.bc_coeffs.copyin( twos.data(), twos.size() ); + state.k1.resize( mod.num_dofs() ); + state.k2.resize( mod.num_dofs() ); + state.k3.resize( mod.num_dofs() ); + state.k4.resize( mod.num_dofs() ); + state.tmp.resize( mod.num_dofs() ); + + for (auto& e : mod) { entity_data_cpu ed; @@ -36,6 +51,9 @@ void init_from_model(const model& mod, solver_state_gpu& state) state.eds.push_back( std::move(ed) ); state.edgs.push_back( std::move(edg) ); } + + state.curr_time = 0.0; + state.curr_timestep = 0; } @@ -95,19 +113,84 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out) void apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next) { - timecounter_gpu tc; - tc.tic(); - compute_curls(state, curr, next); compute_fluxes(state, curr, next); +} - double time = tc.toc(); - std::cout << curr.num_dofs/time << " DOF/s" << std::endl; +void +compute_euler_update(solver_state_gpu& state, const field_gpu& y, + const field_gpu& k, double dt, field_gpu& out) +{ + double *r = state.matparams.sigma_over_epsilon.data(); + double *eps = state.matparams.inv_epsilon.data(); + double *mu = state.matparams.inv_mu.data(); + + auto outp = out.data(); + auto yp = y.data(); + auto kp = k.data(); + + gpu_compute_euler_update(outp.Ex, yp.Ex, kp.Ex, r, eps, out.num_dofs, dt); + gpu_compute_euler_update(outp.Ey, yp.Ey, kp.Ey, r, eps, out.num_dofs, dt); + gpu_compute_euler_update(outp.Ez, yp.Ez, kp.Ez, r, eps, out.num_dofs, dt); + gpu_compute_euler_update(outp.Hx, yp.Hx, kp.Hx, mu, out.num_dofs, dt); + gpu_compute_euler_update(outp.Hy, yp.Hy, kp.Hy, mu, out.num_dofs, dt); + gpu_compute_euler_update(outp.Hz, yp.Hz, kp.Hz, mu, out.num_dofs, dt); +} + +void +compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in, + double dt, field_gpu& out) +{ + double *r = state.matparams.sigma_over_epsilon.data(); + double *eps = state.matparams.inv_epsilon.data(); + double *mu = state.matparams.inv_mu.data(); + + auto inp = in.data(); + auto outp = out.data(); + auto k1p = state.k1.data(); + auto k2p = state.k2.data(); + auto k3p = state.k3.data(); + auto k4p = state.k4.data(); + + gpu_compute_rk4_weighted_sum(outp.Ex, inp.Ex, k1p.Ex, k2p.Ex, k3p.Ex, k4p.Ex, r, eps, out.num_dofs, dt); + gpu_compute_rk4_weighted_sum(outp.Ey, inp.Ey, k1p.Ey, k2p.Ey, k3p.Ey, k4p.Ey, r, eps, out.num_dofs, dt); + gpu_compute_rk4_weighted_sum(outp.Ez, inp.Ez, k1p.Ez, k2p.Ez, k3p.Ez, k4p.Ez, r, eps, out.num_dofs, dt); + gpu_compute_rk4_weighted_sum(outp.Hx, inp.Hx, k1p.Hx, k2p.Hx, k3p.Hx, k4p.Hx, mu, out.num_dofs, dt); + gpu_compute_rk4_weighted_sum(outp.Hy, inp.Hy, k1p.Hy, k2p.Hy, k3p.Hy, k4p.Hy, mu, out.num_dofs, dt); + gpu_compute_rk4_weighted_sum(outp.Hz, inp.Hz, k1p.Hz, k2p.Hz, k3p.Hz, k4p.Hz, mu, out.num_dofs, dt); } void timestep(solver_state_gpu& state) -{} +{ + timecounter_gpu tc; + tc.tic(); + + /* + apply_operator(state, state.emf_curr, state.tmp); + compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next); + */ + + apply_operator(state, state.emf_curr, state.k1); + + compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp); + apply_operator(state, state.tmp, state.k2); + + compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp); + apply_operator(state, state.tmp, state.k3); + + compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp); + apply_operator(state, state.tmp, state.k4); + + compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next); + + state.curr_time += state.delta_t; + state.curr_timestep += 1; + + double time = tc.toc(); + double dofs_per_sec = 6*state.emf_curr.num_dofs/time; + std::cout << "Timestep " << state.curr_timestep << ", " << dofs_per_sec << " DOFs/s" << std::endl; +} void diff --git a/src/maxwell_solver.cpp b/src/maxwell_solver.cpp index c4d300119a555b118af48aa71d655eb693ddb58c..e8c6208ba97e85d91e3a193f9e6dbf46aa3e49cf 100644 --- a/src/maxwell_solver.cpp +++ b/src/maxwell_solver.cpp @@ -41,11 +41,10 @@ void test_it(const model& mod, State& state) maxwell::init_from_model(mod, state); maxwell::init_E_field(mod, state, E); - //maxwell::apply_operator(state, state.emf_curr, state.emf_next); state.delta_t = 0.001; - for(size_t i = 0; i < 1000; i++) + for(size_t i = 0; i < 10000; i++) { timestep(state); std::stringstream ss;