Commit 8a634736 authored by Matteo Cicuttin

One-shot flux lifting.

parent 4e9a5c3c
@@ -366,6 +366,9 @@ void timestep(const model&, solver_state_gpu&, const parameter_loader&, time_int
 void prepare_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void do_sources(const model&, maxwell::solver_state_gpu&, maxwell::parameter_loader&);
 void swap(maxwell::solver_state_gpu&, const parameter_loader&);
+void
+gpu_compute_flux_lifting_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
+    field_gpu::raw_ptrs& out, cudaStream_t stream);
 #ifdef USE_MPI
 void gather_field(const model&, maxwell::field_gpu&, int, MPI_Comm);
 #endif /* USE_MPI */
@@ -10,4 +10,4 @@ Physical Surface("pmc", 11) = {3,4,8,9};
 Physical Surface("pec", 12) = {5,6,10,11};
 Physical Surface("abc", 13) = {7};
-MeshSize{:} = 0.05;
+MeshSize{:} = 0.02;
@@ -4,10 +4,10 @@
 sim.name = "twomat"                   -- simulation name
 sim.dt = 1e-12                        -- timestep size
 sim.timesteps = 20000                 -- num of iterations
-sim.gmsh_model = "twomat.geo"         -- gmsh model filename
-sim.use_gpu = 0                       -- 0: cpu, 1: gpu
-sim.approx_order = 1                  -- approximation order
-sim.time_integrator = "leapfrog"
+sim.gmsh_model = "parallel_plate.geo" -- gmsh model filename
+sim.use_gpu = 1                       -- 0: cpu, 1: gpu
+sim.approx_order = 3                  -- approximation order
+sim.time_integrator = "rk4"
 postpro.silo_output_rate = 100
 postpro.cycle_print_rate = 10         -- console print rate
@@ -374,12 +374,15 @@ compute_fluxes(solver_state_gpu& state, const field_gpu& in, field_gpu& out, boo
     for (auto& edg : state.edgs)
     {
-        gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
-        gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
-        gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
-        gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
-        gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
-        gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
+        //gpu_compute_flux_lifting(edg, fluxesp.Ex, outp.Ex, state.compute_stream);
+        //gpu_compute_flux_lifting(edg, fluxesp.Ey, outp.Ey, state.compute_stream);
+        //gpu_compute_flux_lifting(edg, fluxesp.Ez, outp.Ez, state.compute_stream);
+        //gpu_compute_flux_lifting(edg, fluxesp.Hx, outp.Hx, state.compute_stream);
+        //gpu_compute_flux_lifting(edg, fluxesp.Hy, outp.Hy, state.compute_stream);
+        //gpu_compute_flux_lifting(edg, fluxesp.Hz, outp.Hz, state.compute_stream);
+        gpu_compute_flux_lifting_oneshot(edg, fluxesp, outp, state.compute_stream);
     }
 }
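Note that the one-shot variant takes the whole field_gpu::raw_ptrs bundle rather than one component array at a time. The actual definition of raw_ptrs lives elsewhere in the repository; a minimal sketch of what it presumably contains, for illustration only:

    /* Hypothetical sketch of field_gpu::raw_ptrs -- the real definition is
     * elsewhere in the repository. */
    struct raw_ptrs
    {
        double *Ex, *Ey, *Ez;   /* electric field component dofs */
        double *Hx, *Hy, *Hz;   /* magnetic field component dofs */
    };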
@@ -417,4 +417,119 @@ decompress_bndsrc(const solver_state_gpu& state, const field_gpu& csrcs, field_g
        state.bndsrcs_decomp_table.data(), csrcs.data(), srcs.data());
}
template<size_t K>
__global__ void
gpu_lift_planar_oneshot(field_gpu::raw_ptrs flux, gpuTextureObject_t LM_tex,
    const double * __restrict__ dets, field_gpu::raw_ptrs lifted_flux,
    int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base)
{
    /* This kernel saturates the texture cache bandwidth (~1 TB/s on K20x!!)
     * and thus does not achieve good performance. This happens because the
     * lifting matrix entries are not reused sufficiently: sufficient reuse
     * would require using a single entry to multiply several right-hand
     * sides instead of only one at a time. Orientations, however, make that
     * implementation complicated, so for now the kernel remains slow. */
    using KS = kernel_gpu_sizes<K>;

    /* One thread per *output* dof. */
    int ofs_in_entity = blockIdx.x * blockDim.x + threadIdx.x;
    if (ofs_in_entity >= num_all_elems*KS::num_bf)
        return;

    int32_t cur_dof_offset = dof_base + ofs_in_entity;        /* global output dof */
    int32_t iT = ofs_in_entity / KS::num_bf;                  /* element index */
    int32_t elem_flux_base = flux_base + 4*iT*KS::num_fluxes; /* flux dofs of the 4 faces of iT */
    int32_t iO = orients[iT];                                 /* element orientation */
    int32_t LM_row = ofs_in_entity % KS::num_bf;              /* row in the lifting matrix */
    int32_t LM_orient = 4*KS::num_bf*KS::num_fluxes*iO;       /* per-orientation matrix block */
    double inv_det = 1./dets[iT];                             /* inverse Jacobian determinant */

    double acc_Ex = 0.0;
    double acc_Ey = 0.0;
    double acc_Ez = 0.0;
    double acc_Hx = 0.0;
    double acc_Hy = 0.0;
    double acc_Hz = 0.0;

    /* Fetch each lifting matrix entry once and apply it to all six field
     * components. */
    for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
    {
        const int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
        const int32_t f_ofs = elem_flux_base + dof;
        const double L = inv_det * fetch_tex(LM_tex, l_ofs);
        acc_Ex += L * flux.Ex[f_ofs];
        acc_Ey += L * flux.Ey[f_ofs];
        acc_Ez += L * flux.Ez[f_ofs];
        acc_Hx += L * flux.Hx[f_ofs];
        acc_Hy += L * flux.Hy[f_ofs];
        acc_Hz += L * flux.Hz[f_ofs];
    }

    lifted_flux.Ex[cur_dof_offset] += acc_Ex;
    lifted_flux.Ey[cur_dof_offset] += acc_Ey;
    lifted_flux.Ez[cur_dof_offset] += acc_Ez;
    lifted_flux.Hx[cur_dof_offset] += acc_Hx;
    lifted_flux.Hy[cur_dof_offset] += acc_Hy;
    lifted_flux.Hz[cur_dof_offset] += acc_Hz;
}
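fetch_tex() is defined elsewhere in the repository. Since double-precision values cannot be fetched from a texture directly, a typical implementation (assumed here, not taken from this commit) reads the two 32-bit halves as an int2 and reassembles the double:

    /* Assumed implementation sketch: read a double through a texture object
     * by fetching its two 32-bit words and recombining them. The repo's
     * gpuTextureObject_t presumably aliases cudaTextureObject_t. */
    static __device__ __forceinline__ double
    fetch_tex(cudaTextureObject_t tex, int32_t ofs)
    {
        int2 v = tex1Dfetch<int2>(tex, ofs);
        return __hiloint2double(v.y, v.x);
    }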
template<size_t K>
void
launch_lift_kernel_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
    field_gpu::raw_ptrs& out, cudaStream_t stream)
{
    const auto THREADS_PER_BLOCK = 128; //kernel_gpu_sizes<K>::deriv_threads;

    /* Ceiling division: one thread per output dof. */
    auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
    if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
        num_blocks += 1;

    auto Ltex = edg.lifting_matrices.get_texture();
    int32_t num_elems = edg.num_all_elems;
    auto orients = edg.element_orientation.data();
    auto dets = edg.cell_determinants.data();

    if (edg.g_order == 1)
        gpu_lift_planar_oneshot<K><<<num_blocks, THREADS_PER_BLOCK, 0, stream>>>(f, Ltex,
            dets, out, num_elems, orients, edg.dof_base, edg.flux_base);
    /* Curved elements (g_order > 1) are not handled yet. */
    //else
    //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
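On the host side, edg.lifting_matrices.get_texture() presumably wraps the linear device buffer holding the lifting matrices in a texture object. A sketch of how that is usually done with the CUDA runtime API (assumed, with a hypothetical helper name):

    /* Hypothetical helper: bind a linear buffer of doubles to a texture
     * object, described as int2 so that fetch_tex() can recombine the
     * two 32-bit words into a double. */
    static cudaTextureObject_t
    make_linear_texture(const double *buf, size_t num_entries)
    {
        cudaResourceDesc rd = {};
        rd.resType = cudaResourceTypeLinear;
        rd.res.linear.devPtr = (void *) buf;
        rd.res.linear.desc = cudaCreateChannelDesc<int2>();
        rd.res.linear.sizeInBytes = num_entries * sizeof(double);

        cudaTextureDesc td = {};
        td.readMode = cudaTextureReadModeElementType;

        cudaTextureObject_t tex;
        cudaCreateTextureObject(&tex, &rd, &td, nullptr);
        return tex;
    }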
void
gpu_compute_flux_lifting_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
    field_gpu::raw_ptrs& out, cudaStream_t stream)
{
    /* Map the runtime approximation order to a compile-time template
     * parameter. */
    switch (edg.a_order)
    {
    case 1:
        launch_lift_kernel_oneshot<1>(edg, f, out, stream);
        break;
    case 2:
        launch_lift_kernel_oneshot<2>(edg, f, out, stream);
        break;
    case 3:
        launch_lift_kernel_oneshot<3>(edg, f, out, stream);
        break;
    case 4:
        launch_lift_kernel_oneshot<4>(edg, f, out, stream);
        break;
    case 5:
        launch_lift_kernel_oneshot<5>(edg, f, out, stream);
        break;
    case 6:
        launch_lift_kernel_oneshot<6>(edg, f, out, stream);
        break;
    default:
        throw std::invalid_argument("gpu_compute_flux_lifting_oneshot: invalid order");
    }
}
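The switch exists so that kernel_gpu_sizes<K> can expose the per-order sizes as compile-time constants to the kernel. For tetrahedral elements the constants presumably follow the usual P_K dimension formulas; a sketch under that assumption (not the repository's actual definition):

    /* Assumed sketch of the compile-time size traits used above. For a P_K
     * basis on a tetrahedron: (K+1)(K+2)(K+3)/6 volume dofs per element and
     * (K+1)(K+2)/2 flux dofs per (triangular) face, four faces per element. */
    template<size_t K>
    struct kernel_gpu_sizes_sketch
    {
        static constexpr int32_t num_bf     = (K+1)*(K+2)*(K+3)/6;
        static constexpr int32_t num_fluxes = (K+1)*(K+2)/2;
    };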
} // namespace maxwell