Skip to content
Snippets Groups Projects
Commit 83d6b8c7 authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

GPU solver working again.

parent 065ff4f2
Branches
Tags
No related merge requests found
...@@ -225,6 +225,7 @@ public: ...@@ -225,6 +225,7 @@ public:
(void) checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) ); (void) checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
m_size = other.m_size; m_size = other.m_size;
return *this;
} }
void zero(void) void zero(void)
......
...@@ -148,4 +148,22 @@ gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F, ...@@ -148,4 +148,22 @@ gpu_compute_field_derivatives(const entity_data_gpu& edg, const double *F,
void gpu_compute_flux_lifting(entity_data_gpu&, const double *, double *); void gpu_compute_flux_lifting(entity_data_gpu&, const double *, double *);
void gpu_subtract(double *, const double *, const double *, size_t); void gpu_subtract(double *, const double *, const double *, size_t);
\ No newline at end of file
void
gpu_compute_euler_update(double *out, const double *y, const double *k,
const double *reaction, const double *material, size_t num_dofs, double dt);
void
gpu_compute_euler_update(double *out, const double *y, const double *k,
const double *material, size_t num_dofs, double dt);
void
gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
const double *k2, const double *k3, const double *k4,
const double *material, size_t num_dofs, double dt);
void
gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
const double *k2, const double *k3, const double *k4, const double *reaction,
const double *material, size_t num_dofs, double dt);
\ No newline at end of file
...@@ -163,6 +163,12 @@ struct solver_state_gpu ...@@ -163,6 +163,12 @@ struct solver_state_gpu
field_gpu dx; field_gpu dx;
field_gpu dy; field_gpu dy;
field_gpu dz; field_gpu dz;
field_gpu k1;
field_gpu k2;
field_gpu k3;
field_gpu k4;
field_gpu tmp;
}; };
void init_from_model(const model&, solver_state&); void init_from_model(const model&, solver_state&);
......
...@@ -306,4 +306,125 @@ gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs) ...@@ -306,4 +306,125 @@ gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs)
gs += 1; gs += 1;
gpu_subtract_kernel<<<gs, SUBTRACT_THREADS>>>(dst, add, sub, num_dofs); gpu_subtract_kernel<<<gs, SUBTRACT_THREADS>>>(dst, add, sub, num_dofs);
}
void __global__
gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
const double * __restrict__ k, size_t num_dofs, double dt)
{
int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
if (dof < num_dofs)
out[dof] = y[dof] + dt*k[dof];
}
void __global__
gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
const double * __restrict__ k, const double * __restrict__ material,
size_t num_dofs, double dt)
{
int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
if (dof < num_dofs)
out[dof] = y[dof] + dt*k[dof]*material[dof];
}
void
gpu_compute_euler_update(double *out, const double *y, const double *k,
const double *material, size_t num_dofs, double dt)
{
static const size_t EULER_THREADS = 256;
auto gs = num_dofs/EULER_THREADS;
if (num_dofs % EULER_THREADS != 0)
gs += 1;
gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, material, num_dofs, dt);
}
void __global__
gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
const double * __restrict__ k, const double * __restrict__ reaction,
const double * __restrict__ material, size_t num_dofs, double dt)
{
int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
if (dof < num_dofs)
out[dof] = y[dof]*(1-dt*reaction[dof]) + dt*k[dof]*material[dof];
}
void
gpu_compute_euler_update(double *out, const double *y, const double *k,
const double *reaction, const double *material, size_t num_dofs, double dt)
{
static const size_t EULER_THREADS = 256;
auto gs = num_dofs/EULER_THREADS;
if (num_dofs % EULER_THREADS != 0)
gs += 1;
gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, reaction, material, num_dofs, dt);
}
void __global__
gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
const double * __restrict__ k1, const double * __restrict__ k2,
const double * __restrict__ k3, const double * __restrict__ k4,
size_t num_dofs, double dt)
{
int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
if (dof < num_dofs)
out[dof] = in[dof] + dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
}
void __global__
gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
const double * __restrict__ k1, const double * __restrict__ k2,
const double * __restrict__ k3, const double * __restrict__ k4,
const double * __restrict__ material,
size_t num_dofs, double dt)
{
int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
if (dof < num_dofs)
out[dof] = in[dof] + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
}
void
gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
const double *k2, const double *k3, const double *k4,
const double *material, size_t num_dofs, double dt)
{
static const size_t RK4_THREADS = 256;
auto gs = num_dofs/RK4_THREADS;
if (num_dofs % RK4_THREADS != 0)
gs += 1;
gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4,
material, num_dofs, dt);
}
void __global__
gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
const double * __restrict__ k1, const double * __restrict__ k2,
const double * __restrict__ k3, const double * __restrict__ k4,
const double * __restrict__ reaction, const double * __restrict__ material,
size_t num_dofs, double dt)
{
int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
if (dof < num_dofs)
out[dof] = in[dof]*(1-dt*reaction[dof]) + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
}
void
gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
const double *k2, const double *k3, const double *k4, const double *reaction,
const double *material, size_t num_dofs, double dt)
{
static const size_t RK4_THREADS = 256;
auto gs = num_dofs/RK4_THREADS;
if (num_dofs % RK4_THREADS != 0)
gs += 1;
gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4,
reaction, material, num_dofs, dt);
} }
\ No newline at end of file
...@@ -16,18 +16,33 @@ void init_from_model(const model& mod, solver_state_gpu& state) ...@@ -16,18 +16,33 @@ void init_from_model(const model& mod, solver_state_gpu& state)
state.dx.resize( mod.num_dofs() ); state.dx.resize( mod.num_dofs() );
state.dy.resize( mod.num_dofs() ); state.dy.resize( mod.num_dofs() );
state.dz.resize( mod.num_dofs() ); state.dz.resize( mod.num_dofs() );
state.jumps.resize( mod.num_fluxes() ); state.jumps.resize( mod.num_fluxes() );
state.fluxes.resize( mod.num_fluxes() ); state.fluxes.resize( mod.num_fluxes() );
std::vector<double> zeros( mod.num_dofs(), 0.0 );
std::vector<double> onehalfs( mod.num_fluxes(), 0.5 ); std::vector<double> onehalfs( mod.num_fluxes(), 0.5 );
std::vector<double> ones( mod.num_dofs(), 1.0 );
std::vector<double> twos( mod.num_fluxes(), 2.0 ); std::vector<double> twos( mod.num_fluxes(), 2.0 );
state.matparams.inv_epsilon.copyin( ones.data(), ones.size() );
state.matparams.inv_mu.copyin( ones.data(), ones.size() );
state.matparams.sigma.copyin( zeros.data(), zeros.size() );
state.matparams.sigma_over_epsilon.copyin( zeros.data(), zeros.size() );
state.matparams.aE.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.aE.copyin( onehalfs.data(), onehalfs.size() );
state.matparams.bE.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.bE.copyin( onehalfs.data(), onehalfs.size() );
state.matparams.aH.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.aH.copyin( onehalfs.data(), onehalfs.size() );
state.matparams.bH.copyin( onehalfs.data(), onehalfs.size() ); state.matparams.bH.copyin( onehalfs.data(), onehalfs.size() );
state.matparams.bc_coeffs.copyin( twos.data(), twos.size() ); state.matparams.bc_coeffs.copyin( twos.data(), twos.size() );
state.k1.resize( mod.num_dofs() );
state.k2.resize( mod.num_dofs() );
state.k3.resize( mod.num_dofs() );
state.k4.resize( mod.num_dofs() );
state.tmp.resize( mod.num_dofs() );
for (auto& e : mod) for (auto& e : mod)
{ {
entity_data_cpu ed; entity_data_cpu ed;
...@@ -36,6 +51,9 @@ void init_from_model(const model& mod, solver_state_gpu& state) ...@@ -36,6 +51,9 @@ void init_from_model(const model& mod, solver_state_gpu& state)
state.eds.push_back( std::move(ed) ); state.eds.push_back( std::move(ed) );
state.edgs.push_back( std::move(edg) ); state.edgs.push_back( std::move(edg) );
} }
state.curr_time = 0.0;
state.curr_timestep = 0;
} }
...@@ -95,19 +113,84 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out) ...@@ -95,19 +113,84 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out)
void void
apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next) apply_operator(solver_state_gpu& state, const field_gpu& curr, field_gpu& next)
{ {
timecounter_gpu tc;
tc.tic();
compute_curls(state, curr, next); compute_curls(state, curr, next);
compute_fluxes(state, curr, next); compute_fluxes(state, curr, next);
}
double time = tc.toc(); void
std::cout << curr.num_dofs/time << " DOF/s" << std::endl; compute_euler_update(solver_state_gpu& state, const field_gpu& y,
const field_gpu& k, double dt, field_gpu& out)
{
double *r = state.matparams.sigma_over_epsilon.data();
double *eps = state.matparams.inv_epsilon.data();
double *mu = state.matparams.inv_mu.data();
auto outp = out.data();
auto yp = y.data();
auto kp = k.data();
gpu_compute_euler_update(outp.Ex, yp.Ex, kp.Ex, r, eps, out.num_dofs, dt);
gpu_compute_euler_update(outp.Ey, yp.Ey, kp.Ey, r, eps, out.num_dofs, dt);
gpu_compute_euler_update(outp.Ez, yp.Ez, kp.Ez, r, eps, out.num_dofs, dt);
gpu_compute_euler_update(outp.Hx, yp.Hx, kp.Hx, mu, out.num_dofs, dt);
gpu_compute_euler_update(outp.Hy, yp.Hy, kp.Hy, mu, out.num_dofs, dt);
gpu_compute_euler_update(outp.Hz, yp.Hz, kp.Hz, mu, out.num_dofs, dt);
}
void
compute_rk4_weighted_sum(solver_state_gpu& state, const field_gpu& in,
double dt, field_gpu& out)
{
double *r = state.matparams.sigma_over_epsilon.data();
double *eps = state.matparams.inv_epsilon.data();
double *mu = state.matparams.inv_mu.data();
auto inp = in.data();
auto outp = out.data();
auto k1p = state.k1.data();
auto k2p = state.k2.data();
auto k3p = state.k3.data();
auto k4p = state.k4.data();
gpu_compute_rk4_weighted_sum(outp.Ex, inp.Ex, k1p.Ex, k2p.Ex, k3p.Ex, k4p.Ex, r, eps, out.num_dofs, dt);
gpu_compute_rk4_weighted_sum(outp.Ey, inp.Ey, k1p.Ey, k2p.Ey, k3p.Ey, k4p.Ey, r, eps, out.num_dofs, dt);
gpu_compute_rk4_weighted_sum(outp.Ez, inp.Ez, k1p.Ez, k2p.Ez, k3p.Ez, k4p.Ez, r, eps, out.num_dofs, dt);
gpu_compute_rk4_weighted_sum(outp.Hx, inp.Hx, k1p.Hx, k2p.Hx, k3p.Hx, k4p.Hx, mu, out.num_dofs, dt);
gpu_compute_rk4_weighted_sum(outp.Hy, inp.Hy, k1p.Hy, k2p.Hy, k3p.Hy, k4p.Hy, mu, out.num_dofs, dt);
gpu_compute_rk4_weighted_sum(outp.Hz, inp.Hz, k1p.Hz, k2p.Hz, k3p.Hz, k4p.Hz, mu, out.num_dofs, dt);
} }
void void
timestep(solver_state_gpu& state) timestep(solver_state_gpu& state)
{} {
timecounter_gpu tc;
tc.tic();
/*
apply_operator(state, state.emf_curr, state.tmp);
compute_euler_update(state, state.emf_curr, state.tmp, state.delta_t, state.emf_next);
*/
apply_operator(state, state.emf_curr, state.k1);
compute_euler_update(state, state.emf_curr, state.k1, 0.5*state.delta_t, state.tmp);
apply_operator(state, state.tmp, state.k2);
compute_euler_update(state, state.emf_curr, state.k2, 0.5*state.delta_t, state.tmp);
apply_operator(state, state.tmp, state.k3);
compute_euler_update(state, state.emf_curr, state.k3, state.delta_t, state.tmp);
apply_operator(state, state.tmp, state.k4);
compute_rk4_weighted_sum(state, state.emf_curr, state.delta_t, state.emf_next);
state.curr_time += state.delta_t;
state.curr_timestep += 1;
double time = tc.toc();
double dofs_per_sec = 6*state.emf_curr.num_dofs/time;
std::cout << "Timestep " << state.curr_timestep << ", " << dofs_per_sec << " DOFs/s" << std::endl;
}
void void
......
...@@ -41,11 +41,10 @@ void test_it(const model& mod, State& state) ...@@ -41,11 +41,10 @@ void test_it(const model& mod, State& state)
maxwell::init_from_model(mod, state); maxwell::init_from_model(mod, state);
maxwell::init_E_field(mod, state, E); maxwell::init_E_field(mod, state, E);
//maxwell::apply_operator(state, state.emf_curr, state.emf_next);
state.delta_t = 0.001; state.delta_t = 0.001;
for(size_t i = 0; i < 1000; i++) for(size_t i = 0; i < 10000; i++)
{ {
timestep(state); timestep(state);
std::stringstream ss; std::stringstream ss;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment