Commit 15378f6e authored by Matteo Cicuttin's avatar Matteo Cicuttin

Performance optimizations.

parent 6ed2b36e
@@ -314,7 +314,7 @@ void init_from_model(const model&, solver_state&);
 void init_matparams(const model&, solver_state&, const parameter_loader&);
 //void apply_operator(solver_state&, const field&, field&);
 void export_fields_to_silo(const model&, const solver_state&, const parameter_loader&);
-void timestep(solver_state&, time_integrator_type);
+void timestep(solver_state&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state&);
@@ -324,7 +324,7 @@ void init_from_model(const model&, solver_state_gpu&);
 void init_matparams(const model&, solver_state_gpu&, const parameter_loader&);
 //void apply_operator(solver_state_gpu&, const field_gpu&, field_gpu&);
 void export_fields_to_silo(const model&, const solver_state_gpu&, const parameter_loader&);
-void timestep(solver_state_gpu&, time_integrator_type);
+void timestep(solver_state_gpu&, const parameter_loader&, time_integrator_type);
 void prepare_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state_gpu&);
@@ -398,13 +398,13 @@ void gpu_compute_jumps_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
     double *, gpuStream_t stream = 0);
 void gpu_compute_fluxes(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 void gpu_compute_fluxes_E(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 void gpu_compute_fluxes_H(const entity_data_gpu&, const field_gpu&, field_gpu&,
-    const field_gpu&, const material_params_gpu&, gpuStream_t stream = 0);
+    const field_gpu&, const material_params_gpu&, bool, gpuStream_t stream = 0);
 void decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs,
     field_gpu& srcs);
@@ -227,7 +227,7 @@ gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
     for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
     {
         int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
-        int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes);
+        int32_t f_ofs = elem_flux_base + dof;
         acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs);
     }
@@ -304,7 +304,7 @@ void
 gpu_curl(double *dst, const double *add, const double *sub, size_t num_dofs,
     cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
@@ -326,7 +326,7 @@ void
 gpu_curl(double *dst, const double *add, const double *sub, const double *source,
     size_t num_dofs, cudaStream_t stream)
 {
-    static const size_t SUBTRACT_THREADS = 256;
+    static const size_t SUBTRACT_THREADS = 128;
     auto gs = num_dofs/SUBTRACT_THREADS;
     if (num_dofs % SUBTRACT_THREADS != 0)
         gs += 1;
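Note on the launch-geometry changes above: the block size drops from 256 to 128 threads, presumably an occupancy/latency tuning choice for these memory-bound kernels (the commit message only says "performance optimizations", so the rationale is inferred). The accompanying grid-size computation is the usual ceiling division; a minimal standalone sketch, with illustrative names:

    #include <cstddef>

    // Smallest grid size gs such that gs * block_size >= num_dofs;
    // equivalent to the "gs = n/b; if (n % b) gs += 1;" idiom in the diff.
    static inline size_t grid_size(size_t num_dofs, size_t block_size)
    {
        return (num_dofs + block_size - 1) / block_size;
    }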
@@ -468,8 +468,9 @@ gpu_leapfrog_update_kernel(double * __restrict__ next, const double * __restrict
     if (dof < num_dofs)
     {
         double dt_sigma_over_2eps = 0.5*dt*soe[dof];
-        double CR = (1.0 - dt_sigma_over_2eps)/(1.0 + dt_sigma_over_2eps);
-        double CC = dt*ie[dof]/(1.0 + dt_sigma_over_2eps);
+        double k = 1./(1.0 + dt_sigma_over_2eps);
+        double CR = (1.0 - dt_sigma_over_2eps)*k;
+        double CC = dt*ie[dof]*k;
         next[dof] = curr[dof]*CR + tmp[dof]*CC;
     }
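The kernel rewrite above is a pure strength reduction: both update coefficients share the denominator (1 + dt_sigma_over_2eps), so factoring out its reciprocal replaces two double-precision divisions per DOF with one division and two multiplies. With a = 0.5*dt*(sigma/eps) (from soe[]) and 1/eps from ie[]:

    k  = 1/(1 + a)
    CR = (1 - a)/(1 + a) = (1 - a)*k
    CC = dt*(1/eps)/(1 + a) = dt*(1/eps)*k

The two forms are algebraically equivalent, though not guaranteed bit-identical in floating point.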
@@ -480,7 +481,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr, const double *soe,
     const double *tmp, const double *ie, size_t num_dofs, double dt,
     cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
@@ -508,7 +509,7 @@ gpu_compute_leapfrog_update(double *next, const double *curr,
     const double *tmp, const double *im, size_t num_dofs, double dt,
     cudaStream_t stream)
 {
-    static const size_t EULER_THREADS = 256;
+    static const size_t EULER_THREADS = 128;
     auto gs = num_dofs/EULER_THREADS;
     if (num_dofs % EULER_THREADS != 0)
@@ -505,7 +505,7 @@ leapfrog(solver_state& state)
 }
 
 void
-timestep(solver_state& state, time_integrator_type ti)
+timestep(solver_state& state, const parameter_loader&, time_integrator_type ti)
 {
     if (ti == time_integrator_type::EULER)
     {
@@ -150,7 +150,7 @@ compress_bndsrc(const solver_state_gpu& state, const field& srcs, pinned_field&
 }
 
 static void
-compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -172,9 +172,19 @@ compute_curls(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
     auto Jy = state.Jy_src_gpu.data();
     auto Jz = state.Jz_src_gpu.data();
 
+    if (use_sources)
+    {
     gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
 
     gpu_curl(nextp.Hx, dzp.Ey, dyp.Ez, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hy, dxp.Ez, dzp.Ex, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Hz, dyp.Ex, dxp.Ey, nextp.num_dofs, state.compute_stream);
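The use_sources flag added above lets the E-side curls skip the current-density term, and its three extra global-memory reads per DOF, whenever no volume source is active. Judging from the argument order, gpu_curl evaluates one Cartesian component of the curl with an optional source subtraction; for the x-component of the E update, for example:

    dEx/dt ~ (curl H)_x - Jx = dHz/dy - dHy/dz - Jx

which matches gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, ...), i.e. dst = add - sub - source, while the source-free overload computes dst = add - sub.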
@@ -203,7 +213,7 @@ compute_curls_E(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
 }
 
 static void
-compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next, bool use_sources)
 {
     auto currp = curr.data();
     auto nextp = next.data();
@@ -218,17 +228,26 @@ compute_curls_H(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
         gpu_compute_field_derivatives(edg, currp.Hz, dxp.Hz, dyp.Hz, dzp.Hz, 1.0, state.compute_stream);
     }
 
+    if (use_sources)
+    {
     auto Jx = state.Jx_src_gpu.data();
     auto Jy = state.Jy_src_gpu.data();
     auto Jz = state.Jz_src_gpu.data();
 
     gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, Jx, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, Jy, nextp.num_dofs, state.compute_stream);
     gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, Jz, nextp.num_dofs, state.compute_stream);
+    }
+    else
+    {
+        gpu_curl(nextp.Ex, dyp.Hz, dzp.Hy, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ey, dzp.Hx, dxp.Hz, nextp.num_dofs, state.compute_stream);
+        gpu_curl(nextp.Ez, dxp.Hy, dyp.Hx, nextp.num_dofs, state.compute_stream);
+    }
 }
 
 static void
-compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -238,7 +257,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -255,7 +274,7 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -265,7 +284,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_E(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -279,7 +298,7 @@ compute_fluxes_E(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
+compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out, bool use_sources)
 {
     auto matparams_ptrs = state.matparams.data();
     double *bc_coeffs = matparams_ptrs.bc_coeffs;
@@ -289,7 +308,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
     for (const auto& edg : state.edgs)
         maxwell::gpu_compute_fluxes_H(edg, state.jumps, state.fluxes, state.bndsrcs,
-            state.matparams, state.compute_stream);
+            state.matparams, use_sources, state.compute_stream);
 
     auto fluxesp = state.fluxes.data();
     auto outp = out.data();
@@ -303,7 +322,7 @@ compute_fluxes_H(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
 }
 
 static void
-leapfrog(solver_state_gpu& state)
+leapfrog(solver_state_gpu& state, const parameter_loader& mpl)
 {
     double *r = state.matparams.sigma_over_epsilon.data();
     double *eps = state.matparams.inv_epsilon.data();
@@ -313,10 +332,14 @@ leapfrog(solver_state_gpu& state)
     auto nextp = state.emf_next.data();
     auto tmpp = state.tmp.data();
 
-    auto dt = state.delta_t;
-    compute_curls_H(state, state.emf_curr, state.tmp);
-    compute_fluxes_H(state, state.emf_curr, state.tmp);
+    bool ve = mpl.volume_sources_enabled();
+    compute_curls_H(state, state.emf_curr, state.tmp, ve);
+
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
+    compute_fluxes_H(state, state.emf_curr, state.tmp, be or ie);
 
+    auto dt = state.delta_t;
     gpu_compute_leapfrog_update(nextp.Ex, currp.Ex, r, tmpp.Ex, eps, nextp.num_dofs,
         dt, state.compute_stream);
     gpu_compute_leapfrog_update(nextp.Ey, currp.Ey, r, tmpp.Ey, eps, nextp.num_dofs,
@@ -325,7 +348,7 @@ leapfrog(solver_state_gpu& state)
         dt, state.compute_stream);
 
     compute_curls_E(state, state.emf_next, state.tmp);
-    compute_fluxes_E(state, state.emf_next, state.tmp);
+    compute_fluxes_E(state, state.emf_next, state.tmp, be or ie);
 
     gpu_compute_leapfrog_update(nextp.Hx, currp.Hx, tmpp.Hx, mu, nextp.num_dofs,
         dt, state.compute_stream);
@@ -336,10 +359,14 @@ leapfrog(solver_state_gpu& state)
 }
 
 static void
-apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
+apply_operator(solver_state_gpu& state, const parameter_loader& mpl, const field_gpu& curr,
+    field_gpu& next)
 {
-    compute_curls(state, curr, next);
-    compute_fluxes(state, curr, next);
+    bool ve = mpl.volume_sources_enabled();
+    auto be = mpl.boundary_sources_enabled();
+    auto ie = mpl.interface_sources_enabled();
+    compute_curls(state, curr, next, ve);
+    compute_fluxes(state, curr, next, be or ie);
 }
 
 static void
@@ -386,36 +413,36 @@ compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
 }
 
 void
-timestep(solver_state_gpu& state, time_integrator_type ti)
+timestep(solver_state_gpu& state, const parameter_loader& mpl, time_integrator_type ti)
 {
     //timecounter tc;
     //tc.tic();
 
     if (ti == time_integrator_type::EULER)
     {
-        apply_operator(state, state.emf_curr, state.tmp);
+        apply_operator(state, mpl, state.emf_curr, state.tmp);
         compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::RK4)
     {
-        apply_operator(state, state.emf_curr, state.k1);
+        apply_operator(state, mpl, state.emf_curr, state.k1);
         compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k2);
+        apply_operator(state, mpl, state.tmp, state.k2);
         compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k3);
+        apply_operator(state, mpl, state.tmp, state.k3);
         compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
-        apply_operator(state, state.tmp, state.k4);
+        apply_operator(state, mpl, state.tmp, state.k4);
         compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
     }
 
     if (ti == time_integrator_type::LEAPFROG)
     {
-        leapfrog(state);
+        leapfrog(state, mpl);
     }
 
     state.curr_time += state.delta_t;
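The RK4 branch is the classical fourth-order Runge-Kutta scheme expressed through repeated Euler-style updates; assuming apply_operator evaluates the semi-discrete operator L and compute_rk4_weighted_sum forms the standard weighted combination, the stages read:

    k1 = L(y_n)
    k2 = L(y_n + 0.5*dt*k1)
    k3 = L(y_n + 0.5*dt*k2)
    k4 = L(y_n + dt*k3)
    y_{n+1} = y_n + (dt/6)*(k1 + 2*k2 + 2*k3 + k4)

Threading mpl through every apply_operator call is what lets each stage decide whether source terms need to be evaluated at all.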
@@ -483,6 +510,9 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
     state.Jx_src_gpu.zero();
     state.Jy_src_gpu.zero();
     state.Jz_src_gpu.zero();
+    state.Jx_src_gpu_buf.zero();
+    state.Jy_src_gpu_buf.zero();
+    state.Jz_src_gpu_buf.zero();
     state.bndsrcs_decomp_cpu.zero();
     state.bndsrcs_cpu.zero();
     mpl.source_was_cleared();
@@ -510,13 +540,18 @@ do_sources(const model& mod, maxwell::solver_state_gpu& state,
 }
 
 void
-swap(maxwell::solver_state_gpu& state)
+swap(solver_state_gpu& state)
 {
     checkGPU( gpuDeviceSynchronize() );
 
+    //auto be = mpl.boundary_sources_enabled();
+    //auto ie = mpl.interface_sources_enabled()
+    //if ( be or ie )
+    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
+
     std::swap(state.Jx_src_gpu, state.Jx_src_gpu_buf);
     std::swap(state.Jy_src_gpu, state.Jy_src_gpu_buf);
     std::swap(state.Jz_src_gpu, state.Jz_src_gpu_buf);
-    decompress_bndsrc(state, state.bndsrcs_buf, state.bndsrcs);
 
     std::swap(state.emf_curr, state.emf_next);
 }
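swap() rotates the double-buffered source and field storage after synchronizing the device, so uploads for the next step can proceed while the current step computes. A minimal sketch of that producer/consumer double-buffering pattern (simplified types and names, not the repository's API):

    #include <cuda_runtime.h>
    #include <utility>

    struct src_buffers {
        double *front;   // read by kernels during the current timestep
        double *back;    // filled asynchronously for the next timestep
        size_t  n;
    };

    // Issued on a dedicated copy stream, overlapping the compute stream.
    void upload_next(src_buffers& b, const double *host, cudaStream_t copy_stream)
    {
        cudaMemcpyAsync(b.back, host, b.n * sizeof(double),
                        cudaMemcpyHostToDevice, copy_stream);
    }

    void end_of_step(src_buffers& b)
    {
        cudaDeviceSynchronize();     // both streams have finished
        std::swap(b.front, b.back);  // next step reads what was just uploaded
    }

Note that decompress_bndsrc now runs before the std::swap calls, and the commented-out be/ie checks suggest gating the decompression on active boundary/interface sources is planned but not yet enabled.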
@@ -42,7 +42,7 @@ void
 gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -90,7 +90,7 @@ void
 gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -122,7 +122,7 @@ void
 gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
     double *bc_coeffs, cudaStream_t stream)
 {
-    static const size_t JUMP_THREADS = 256;
+    static const size_t JUMP_THREADS = 128;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
@@ -150,12 +150,12 @@ gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu&
     checkGPU(cudaPeekAtLastError());
 }
 
-template<size_t K>
+template<size_t K, bool do_E, bool do_H>
 __global__ void
 gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
     field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
     const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
+    size_t flux_base, size_t num_fluxes, bool use_sources)
 {
     using KS = kernel_gpu_sizes<K>;
     auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
@@ -171,112 +171,51 @@ gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_
     auto ny = face_normals[3*entry_num + 1];
     auto nz = face_normals[3*entry_num + 2];
 
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
-
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
-
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
-
-    auto aE = face_det * fcp.aE[global_dof_pos];
-    auto bE = face_det * fcp.bE[global_dof_pos];
-
-    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
-    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
-    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
-}
-
-template<size_t K>
-__global__ void
-gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
-    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
-    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
-    size_t flux_base, size_t num_fluxes)
-{
-    using KS = kernel_gpu_sizes<K>;
-    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
-    auto global_dof_pos = flux_base + local_dof_pos;
-    auto entry_num = local_dof_pos/KS::num_fluxes;
-
-    if (local_dof_pos >= num_fluxes)
-        return;
-
-    auto face_det = face_dets[entry_num];
-    auto nx = face_normals[3*entry_num + 0];
-    auto ny = face_normals[3*entry_num + 1];
-    auto nz = face_normals[3*entry_num + 2];
-
-    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
-    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
-    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
-    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
-    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
-    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
-
-    auto ndotH = nx*jHx + ny*jHy + nz*jHz;
-
-    auto aH = face_det * fcp.aH[global_dof_pos];
-    auto bH = face_det * fcp.bH[global_dof_pos];
-
-    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
-    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
-    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
-}
+    auto jEx = jump.Ex[global_dof_pos];
+    auto jEy = jump.Ey[global_dof_pos];
+    auto jEz = jump.Ez[global_dof_pos];
+    auto jHx = jump.Hx[global_dof_pos];
+    auto jHy = jump.Hy[global_dof_pos];
+    auto jHz = jump.Hz[global_dof_pos];
+
+    if (use_sources)
+    {
+        jEx += - bndsrc.Ex[global_dof_pos];
+        jEy += - bndsrc.Ey[global_dof_pos];
+        jEz += - bndsrc.Ez[global_dof_pos];
+        jHx += + bndsrc.Hx[global_dof_pos];
+        jHy += + bndsrc.Hy[global_dof_pos];
+        jHz += + bndsrc.Hz[global_dof_pos];
+    }
+
+    if constexpr(do_E)
+    {
+        auto ndotE = nx*jEx + ny*jEy + nz*jEz;
+        auto aE = face_det * fcp.aE[global_dof_pos];
+        auto bE = face_det * fcp.bE[global_dof_pos];
+        flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
+        flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
+        flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
+    }
+
+    if constexpr(do_H)
+    {
+        auto ndotH = nx*jHx + ny*jHy + nz*jHz;
+        auto aH = face_det * fcp.aH[global_dof_pos];
+        auto bH = face_det * fcp.bH[global_dof_pos];
+        flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
+        flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
+        flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
+    }
+}
 
 template<size_t K, typename Kernel>
 void
 launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
-    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
+    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp, bool use_sources,
     cudaStream_t stream)
 {
-    static const size_t FLUX_THREADS = 256;
+    static const size_t FLUX_THREADS = 1024;
 
     auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
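Collapsing the three near-identical flux kernels into one template with do_E/do_H flags relies on if constexpr (C++17, supported by recent nvcc) to discard the unused half of the body at compile time, so the E-only and H-only instantiations carry no runtime branch. A minimal self-contained sketch of the pattern (illustrative names and flux expressions, not the solver's):

    #include <cstddef>

    template<bool do_E, bool do_H>
    __global__ void
    flux_kernel(double *eflux, double *hflux, const double *jump, size_t n)
    {
        size_t i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n)
            return;
        if constexpr (do_E)
            eflux[i] = 2.0 * jump[i];    // stand-in for the E-flux expression
        if constexpr (do_H)
            hflux[i] = -2.0 * jump[i];   // stand-in for the H-flux expression
    }

    // flux_kernel<true, true>  computes both families in one pass,
    // flux_kernel<true, false> only E, flux_kernel<false, true> only H.

The runtime use_sources bool, by contrast, stays an ordinary branch: it is uniform across the whole grid, so warps never diverge on it.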
@@ -292,38 +231,50 @@ launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const f
     kernel<<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
         edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
-        num_all_fluxes);
+        num_all_fluxes, use_sources);
 }
 
 void
 gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
-    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
             break;
 
     default: throw std::invalid_argument("unsupported order");
@@ -333,32 +284,44 @@ gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, true, false>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
 
    default: throw std::invalid_argument("unsupported order");
@@ -368,32 +331,44 @@ gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
 void
 gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
     field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
-    cudaStream_t stream)
+    bool use_sources, cudaStream_t stream)
 {
     switch( edg.a_order )
     {
-    case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 1: launch_compute_fluxes_kernel<1>(
+                gpu_compute_fluxes_kernel_planar<1, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 2: launch_compute_fluxes_kernel<2>(
+                gpu_compute_fluxes_kernel_planar<2, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 3: launch_compute_fluxes_kernel<3>(
+                gpu_compute_fluxes_kernel_planar<3, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 4: launch_compute_fluxes_kernel<4>(
+                gpu_compute_fluxes_kernel_planar<4, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 5: launch_compute_fluxes_kernel<5>(
+                gpu_compute_fluxes_kernel_planar<5, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
-    case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
-                edg, jumps, fluxes, bndsrc, fcp, stream);
+    case 6: launch_compute_fluxes_kernel<6>(
+                gpu_compute_fluxes_kernel_planar<6, false, true>,
+                edg, jumps, fluxes, bndsrc, fcp, use_sources, stream
+            );
            break;
 
    default: throw std::invalid_argument("unsupported order");
@@ -14,6 +14,7 @@
 #include "maxwell_interface.h"
 #include "maxwell_common.h"
 #include "param_loader.h"
+#include "timecounter.h"
 
 #if 0
 static void

@@ -86,23 +87,28 @@ void test_it(const model& mod, State& state, maxwell::parameter_loader& mpl)
     std::cout << "s each." << std::endl;
     std::cout << "Time integrator: " << mpl.sim_timeIntegratorName() << std::endl;
 
+    timecounter tc;
+    tc.tic();
+
     prepare_sources(mod, state, mpl);
     for(size_t i = 0; i < num_timesteps; i++)
     {
         mpl.call_timestep_callback(i);
-        timestep(state, ti);
+        timestep(state, mpl, ti);
         do_sources(mod, state, mpl);
 
         if (i%silo_output_rate == 0)
             export_fields_to_silo(mod, state, mpl);
-        swap(state);
 
         if (i%cycle_print_rate == 0)
         {
+            double time = tc.toc();
             std::cout << "Cycle " << i << ": t = " << state.curr_time << " s";
-            std::cout << std::endl;
+            std::cout << ", DOFs/s: " << (mod.num_dofs()*6*cycle_print_rate)/(time) << std::endl;
+            tc.tic();
         }
+
+        swap(state);
     }
 }
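The new cycle report turns the raw timer into a throughput figure: six field components times mod.num_dofs() DOFs, processed cycle_print_rate times between prints, divided by the elapsed wall time. timecounter.h itself is not part of this diff; a plausible minimal implementation consistent with the tic()/toc() usage above (an assumption, not the repository's actual class):

    #include <chrono>

    // Hypothetical minimal timecounter matching the tic()/toc() calls above.
    class timecounter
    {
        std::chrono::steady_clock::time_point start;

    public:
        void tic() { start = std::chrono::steady_clock::now(); }

        // Elapsed seconds since the last tic().
        double toc() const
        {
            std::chrono::duration<double> d =
                std::chrono::steady_clock::now() - start;
            return d.count();
        }
    };

Note also that swap(state) moves from right after the SILO export to the end of the loop body, past the cycle report.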