Commit 0227adb3 authored by Marco D'Antonio

Implemented jumps oneshot leapfrog

parent ab41840f
......@@ -474,6 +474,12 @@ void gpu_compute_flux_lifting_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs&
void gpu_compute_jumps_oneshot(const entity_data_gpu& edg, const field_gpu& in,
field_gpu& jumps, double *bc_coeffs, gpuStream_t stream);
void gpu_compute_jumps_oneshot_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream);
void gpu_compute_jumps_oneshot_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream);
void gpu_jumps_to_ipc_E(solver_state_gpu&, size_t, size_t);
void gpu_ipc_to_jumps_E(solver_state_gpu&, size_t, size_t);
void gpu_jumps_swap_ipc_E(solver_state_gpu&, size_t, size_t);
......
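The three new per-field entry points split the existing one-shot jump computation so that the leapfrog integrator can handle H and E in separate half-steps. A minimal host-side sketch of the intended call pattern, mirroring the call sites later in this commit (`in` is the current field and `bc_coeffs` the boundary-coefficient array, both as they appear below):

// hedged sketch: per-entity launch of the H-field one-shot jump kernel,
// as done inside compute_fluxes_H when TM_ONESHOT_JUMPS is defined
for (const auto& edg : state.edgs)
    maxwell::gpu_compute_jumps_oneshot_H(edg, in, state.jumps,
                                         bc_coeffs, state.compute_stream);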
......@@ -9,6 +9,9 @@
#include "libgmshdg/entity_data.h"
#include "libgmshdg/kernels_gpu.h"
// Flops count: (54*num_dofs+6)*num_all_dofs
// Mem accesses: (24*num_dofs+3)*num_all_dofs
template<size_t K>
__global__ void
gpu_curl_planar(const double *fx, const double *fy, const double *fz,
......@@ -78,6 +81,8 @@ gpu_curl_planar(const double *fx, const double *fy, const double *fz,
curl_fz[output_dof_offset] = alpha * (acc_dfy_dx - acc_dfx_dy);
}
// Flops count: (54*num_dofs+9)*num_all_dofs
// Mem accesses: (24*num_dofs+6)*num_all_dofs
template<size_t K>
__global__ void
gpu_curl_planar(const double *fx, const double *fy, const double *fz,
......@@ -151,6 +156,8 @@ gpu_curl_planar(const double *fx, const double *fy, const double *fz,
}
/* compute α∇F */
// Flops count: (21*num_dofs+3)*num_all_dofs
// Mem accesses: (15*num_dofs+3)*num_all_dofs
template<size_t K>
__global__ void
gpu_deriv_planar(const double *F, const double * __restrict__ J,
......@@ -227,8 +234,6 @@ launch_curl_kernel(const entity_data_gpu& edg,
if (edg.g_order == 1)
gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz,
J, Dtex, alpha, curl_fx, curl_fy, curl_fz, num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
template<size_t K>
......@@ -255,8 +260,6 @@ launch_curl_kernel(const entity_data_gpu& edg,
gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz,
J, Dtex, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z,
num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
template<size_t K>
......@@ -280,8 +283,6 @@ launch_deriv_kernel(const entity_data_gpu& edg,
if (edg.g_order == 1)
gpu_deriv_planar<K><<<num_blocks, KS::deriv_threads, 0, stream>>>(f, J,
Dtex, df_dx, df_dy, df_dz, alpha, num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
void
......@@ -458,8 +459,6 @@ launch_lift_kernel(entity_data_gpu& edg, const double *f, double *out, cudaStrea
if (edg.g_order == 1)
gpu_lift_planar<K><<<num_blocks, THREADS_PER_BLOCK, 0, stream>>>(f, Ltex,
dets, out, num_elems, orients, edg.dof_base, edg.flux_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
void
......@@ -493,10 +492,12 @@ gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out,
break;
default:
throw std::invalid_argument("compute_field_derivatives: invalid order");
throw std::invalid_argument("compute_flux_lifting: invalid order");
}
}
// Flops count: 1*num_all_dofs
// Mem accesses: 3*num_all_dofs
void __global__
gpu_curl_kernel(double * __restrict__ dst, const double * __restrict__ add,
const double * __restrict__ sub, size_t num_dofs)
......@@ -518,6 +519,8 @@ gpu_curl(double *dst, const double *add, const double *sub, size_t num_dofs,
gpu_curl_kernel<<<gs, SUBTRACT_THREADS, 0, stream>>>(dst, add, sub, num_dofs);
}
// Flops count: 2*num_all_dofs
// Mem accesses: 4*num_all_dofs
void __global__
gpu_curl_kernel(double * __restrict__ dst, const double * __restrict__ add,
const double * __restrict__ sub, const double * __restrict__ source,
......
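The Flops/Mem comments added above are the per-kernel counts that feed the profiler quantities registered in init_profiling below. A small illustration of the conversion, matching the expressions used there (variable names are illustrative; doubles are 8 bytes):

// hedged sketch: from the per-kernel counts to the registered quantities,
// here for the three-component curl kernel annotated above
double gflops = (54.0*cell_dofs + 9.0) * num_field_dofs / 1e9;        // floating-point ops
double gbytes = (24.0*cell_dofs + 6.0) * num_field_dofs * 8.0 / 1e9;  // 8 bytes per double
// achieved rates then follow from a measured kernel time:
//   gflops / elapsed_seconds, gbytes / elapsed_seconds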
......@@ -218,36 +218,30 @@ void init_profiling(const model& mod, solver_state_gpu& state, time_integrator_t
<< "num_field_dofs: " << num_field_dofs << std::endl
<< "num_field_fluxes: " << num_field_fluxes << std::endl;
auto jacobians_read = 3*3*num_cells;
auto dm_read = 3*cell_dofs*cell_dofs;
auto curl_readwrite = 6*num_field_dofs;
auto sources_application = 3*num_field_dofs;
if (ti == time_integrator_type::LEAPFROG)
{
#ifdef TM_NEW_CURLS
state.logger.register_profiler("compute_curls_H", profiler::type::GPU);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU);
state.logger.register_profiler("compute_curls_H", profiler::type::GPU)
.set_quantity("GFlops", (54*cell_dofs+9)*num_field_dofs/1e9)
.set_quantity("GB", (24*cell_dofs+6)*num_field_dofs*8/1e9);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU)
.set_quantity("GFlops", (54*cell_dofs+6)*num_field_dofs/1e9)
.set_quantity("GB", (24*cell_dofs+3)*num_field_dofs*8/1e9);
#else
auto derivatives = jacobians_read+dm_read+4*num_field_dofs;
auto curls_h = 4*num_field_dofs;
auto curls_e = 3*num_field_dofs;
state.logger.register_profiler("compute_curls_H", profiler::type::GPU);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU);
state.logger.register_profiler("compute_curls_H", profiler::type::GPU)
.set_quantity("GFlops", ((21*cell_dofs+3)*num_field_dofs+2*num_field_dofs)*3/1e9)
.set_quantity("GB", ((15*cell_dofs+3)*num_field_dofs+4*num_field_dofs)*3*8/1e9);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU)
.set_quantity("GFlops", ((21*cell_dofs+3)*num_field_dofs+num_field_dofs)*3/1e9)
.set_quantity("GB", ((15*cell_dofs+3)*num_field_dofs+3*num_field_dofs)*3*8/1e9);
#endif /* TM_NEW_CURLS */
state.logger.register_profiler("compute_field_jumps", profiler::type::GPU);
auto normals_read = num_all_faces;
auto dets_read = num_all_faces;
auto sources_imposition = 18*num_field_fluxes;
auto matparams_readwrite = 4*num_field_fluxes;
auto fluxes_write = 3*num_field_fluxes;
state.logger.register_profiler("compute_field_fluxes", profiler::type::GPU);
dets_read = num_cells;
auto lm_read = num_all_orientations*cell_dofs*4*cell_fluxes;
auto lifting_readwrite = num_field_fluxes+num_field_dofs;
state.logger.register_profiler("compute_flux_lifting", profiler::type::GPU);
}
else
......@@ -256,24 +250,13 @@ void init_profiling(const model& mod, solver_state_gpu& state, time_integrator_t
state.logger.register_profiler("compute_curls", profiler::type::GPU);
#else
auto derivatives = jacobians_read+dm_read+4*num_field_dofs;
auto curls = 4*num_field_dofs + 3*num_field_dofs;
state.logger.register_profiler("compute_curls", profiler::type::GPU);
#endif /* TM_NEW_CURLS */
state.logger.register_profiler("compute_field_jumps", profiler::type::GPU);
auto normals_read = num_all_faces;
auto dets_read = num_all_faces;
auto sources_imposition = 18*num_field_fluxes;
auto matparams_readwrite = 4*num_field_fluxes;
auto fluxes_write = 3*num_field_fluxes;
state.logger.register_profiler("compute_field_fluxes", profiler::type::GPU);
dets_read = num_cells;
auto lm_read = num_all_orientations*cell_dofs*4*cell_fluxes;
auto lifting_readwrite = num_field_fluxes+num_field_dofs;
state.logger.register_profiler("compute_flux_lifting", profiler::type::GPU);
}
}
......@@ -603,8 +586,17 @@ compute_fluxes_E(const model& mod, solver_state_gpu& state,
auto& jumps_profiler = state.logger.get_profiler("compute_field_jumps");
#endif /* ENABLE_PROFILING */
PROFILER_TIC(jumps_profiler);
for (const auto& edg : state.edgs)
{
#ifdef TM_ONESHOT_JUMPS
maxwell::gpu_compute_jumps_oneshot_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#else
maxwell::gpu_compute_jumps_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#endif
}
PROFILER_TOC(jumps_profiler);
#ifdef USE_MPI
......@@ -688,8 +680,16 @@ compute_fluxes_H(const model& mod, solver_state_gpu& state,
auto& jumps_profiler = state.logger.get_profiler("compute_field_jumps");
#endif /* ENABLE_PROFILING */
PROFILER_TIC(jumps_profiler);
for (const auto& edg : state.edgs)
{
#ifdef TM_ONESHOT_JUMPS
maxwell::gpu_compute_jumps_oneshot_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#else
maxwell::gpu_compute_jumps_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#endif
}
PROFILER_TOC(jumps_profiler);
#ifdef USE_MPI
......
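Whether the fused per-field kernels or the original gpu_compute_jumps_E/H path is used is decided purely at compile time, via the TM_ONESHOT_JUMPS guards above. A hedged example of enabling it (the exact build integration is not part of this commit; only the macro name comes from the code):

// add -DTM_ONESHOT_JUMPS to the CUDA/C++ compile flags, e.g. through the
// project's build system; with the macro undefined, compute_fluxes_E/H
// fall back to maxwell::gpu_compute_jumps_E / gpu_compute_jumps_H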
......@@ -94,6 +94,8 @@ gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& ju
checkGPU(cudaPeekAtLastError());
}
// Flops count: 6*num_all_dofs
// Mem accesses: 18*num_all_dofs
__global__ void
gpu_compute_jumps_kernel_oneshot(field_gpu::const_raw_ptrs field,
const size_t * __restrict__ fluxdofs_mine,
......@@ -221,6 +223,125 @@ gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu&
checkGPU(cudaPeekAtLastError());
}
// Flops count: 3*num_all_dofs
// Mem accesses: 9*num_all_dofs
__global__ void
gpu_compute_jumps_kernel_oneshot_H(field_gpu::const_raw_ptrs field,
const size_t * __restrict__ fluxdofs_mine,
const size_t * __restrict__ fluxdofs_other,
field_gpu::raw_ptrs jumps,
const double * __restrict__ bcjc,
size_t base, size_t max)
{
int32_t flux_ofs = blockIdx.x * blockDim.x + threadIdx.x;
if (flux_ofs >= max)
return;
auto idx_mine = fluxdofs_mine[flux_ofs];
auto idx_neigh = fluxdofs_other[flux_ofs];
if (idx_mine == NOT_PRESENT)
return;
if (idx_neigh != NOT_PRESENT)
{
jumps.Hx[base + flux_ofs] = field.Hx[idx_mine] - field.Hx[idx_neigh];
jumps.Hy[base + flux_ofs] = field.Hy[idx_mine] - field.Hy[idx_neigh];
jumps.Hz[base + flux_ofs] = field.Hz[idx_mine] - field.Hz[idx_neigh];
}
else
{
double bc_coeff = bcjc[base + flux_ofs];
jumps.Hx[base + flux_ofs] = (2.0 - bc_coeff)*field.Hx[idx_mine];
jumps.Hy[base + flux_ofs] = (2.0 - bc_coeff)*field.Hy[idx_mine];
jumps.Hz[base + flux_ofs] = (2.0 - bc_coeff)*field.Hz[idx_mine];
}
}
// Flops count: 3*num_all_dofs
// Mem accesses: 9*num_all_dofs
__global__ void
gpu_compute_jumps_kernel_oneshot_E(field_gpu::const_raw_ptrs field,
const size_t * __restrict__ fluxdofs_mine,
const size_t * __restrict__ fluxdofs_other,
field_gpu::raw_ptrs jumps,
const double * __restrict__ bcjc,
size_t base, size_t max)
{
int32_t flux_ofs = blockIdx.x * blockDim.x + threadIdx.x;
if (flux_ofs >= max)
return;
auto idx_mine = fluxdofs_mine[flux_ofs];
auto idx_neigh = fluxdofs_other[flux_ofs];
if (idx_mine == NOT_PRESENT)
return;
if (idx_neigh != NOT_PRESENT)
{
jumps.Ex[base + flux_ofs] = field.Ex[idx_mine] - field.Ex[idx_neigh];
jumps.Ey[base + flux_ofs] = field.Ey[idx_mine] - field.Ey[idx_neigh];
jumps.Ez[base + flux_ofs] = field.Ez[idx_mine] - field.Ez[idx_neigh];
}
else
{
double bc_coeff = bcjc[base + flux_ofs];
jumps.Ex[base + flux_ofs] = bc_coeff*field.Ex[idx_mine];
jumps.Ey[base + flux_ofs] = bc_coeff*field.Ey[idx_mine];
jumps.Ez[base + flux_ofs] = bc_coeff*field.Ez[idx_mine];
}
}
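A note on the boundary branch of the two kernels above (an interpretation, not stated in the commit): if the jump is read as field_mine minus a mirrored ghost state, then the complementary coefficients on E and H reproduce the usual wall conditions.

// hedged reading of bcjc, assuming a mirrored ghost state:
//   bc_coeff = 2.0  ->  E jump = 2*E_mine,  H jump = 0          (PEC-like wall)
//   bc_coeff = 0.0  ->  E jump = 0,         H jump = 2*H_mine   (PMC-like wall)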
void
gpu_compute_jumps_oneshot_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream)
{
static const size_t JUMP_THREADS = 128;
auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
auto gs = num_all_fluxes/JUMP_THREADS;
if (num_all_fluxes % JUMP_THREADS != 0)
gs += 1;
dim3 grid_size(gs);
dim3 threads_per_block(JUMP_THREADS);
/* Compute H-field jumps */
gpu_compute_jumps_kernel_oneshot_H<<<gs, threads_per_block, 0, stream>>>(in.data(),
edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.data(),
bc_coeffs, edg.flux_base, num_all_fluxes);
checkGPU(cudaPeekAtLastError());
}
void
gpu_compute_jumps_oneshot_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream)
{
static const size_t JUMP_THREADS = 128;
auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
auto gs = num_all_fluxes/JUMP_THREADS;
if (num_all_fluxes % JUMP_THREADS != 0)
gs += 1;
dim3 grid_size(gs);
dim3 threads_per_block(JUMP_THREADS);
/* Compute E-field jumps */
gpu_compute_jumps_kernel_oneshot_E<<<gs, threads_per_block, 0, stream>>>(in.data(),
edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.data(),
bc_coeffs, edg.flux_base, num_all_fluxes);
checkGPU(cudaPeekAtLastError());
}
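Both launchers round the grid size up with the same divide-then-increment pattern. The equivalent ceil-division, as a tiny self-contained helper for reference (illustrative only, not part of the commit):

#include <cstddef>
// one extra block whenever the work size is not a multiple of the block size
static inline std::size_t blocks_for(std::size_t work_items, std::size_t block_size)
{
    return (work_items + block_size - 1) / block_size;
}
// e.g. blocks_for(num_all_fluxes, JUMP_THREADS) equals the gs computed above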
template<size_t K, bool do_E, bool do_H>
__global__ void
gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
......@@ -492,12 +613,6 @@ gpu_lift_planar_oneshot(field_gpu::raw_ptrs flux, gpuTextureObject_t LM_tex,
const double * __restrict__ dets, field_gpu::raw_ptrs lifted_flux,
int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base)
{
/* This kernel saturates the texture cache bandwidth (~1 TB/s on K20x!!)
* and thus it does not achieve good performance. This happens because
* the lifting matrix entries are not reused sufficiently. Sufficient
* reuse should be achieved by using a single entry to multiply different
* rhs instead of only one at a time. Orientations however make the
* implementation complicated, so for now the kernel remains slow. */
using KS = kernel_gpu_sizes<K>;
/* One thread per *output* dof */
......@@ -549,7 +664,7 @@ void
launch_lift_kernel_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
field_gpu::raw_ptrs& out, cudaStream_t stream)
{
const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
const auto THREADS_PER_BLOCK = 128;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1;
......@@ -566,8 +681,6 @@ launch_lift_kernel_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
dets, out, num_elems, orients, edg.dof_base, edg.flux_base);
checkGPU(cudaPeekAtLastError());
}
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
void
......@@ -601,7 +714,7 @@ gpu_compute_flux_lifting_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
break;
default:
throw std::invalid_argument("compute_field_derivatives: invalid order");
throw std::invalid_argument("compute_flux_lifting_oneshot: invalid order");
}
}
......