Commit 0227adb3 authored by Marco D'Antonio

Implemented jumps oneshot leapfrog

parent ab41840f
......@@ -474,6 +474,12 @@ void gpu_compute_flux_lifting_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs&
void gpu_compute_jumps_oneshot(const entity_data_gpu& edg, const field_gpu& in,
field_gpu& jumps, double *bc_coeffs, gpuStream_t stream);
void gpu_compute_jumps_oneshot_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream);
void gpu_compute_jumps_oneshot_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream);
void gpu_jumps_to_ipc_E(solver_state_gpu&, size_t, size_t);
void gpu_ipc_to_jumps_E(solver_state_gpu&, size_t, size_t);
void gpu_jumps_swap_ipc_E(solver_state_gpu&, size_t, size_t);
......
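The three new per-field entry points split the existing one-shot jump computation so that the leapfrog integrator can handle H and E in separate half-steps. A minimal host-side sketch of the intended call pattern, mirroring the call sites later in this commit (`in` is the current field and `bc_coeffs` the boundary-coefficient array, both as they appear below):

// hedged sketch: per-entity launch of the H-field one-shot jump kernel,
// as done inside compute_fluxes_H when TM_ONESHOT_JUMPS is defined
for (const auto& edg : state.edgs)
    maxwell::gpu_compute_jumps_oneshot_H(edg, in, state.jumps,
                                         bc_coeffs, state.compute_stream);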
......@@ -9,6 +9,9 @@
#include "libgmshdg/entity_data.h"
#include "libgmshdg/kernels_gpu.h"
// Flops count: (54*num_dofs+6)*num_all_dofs
// Mem accesses: (24*num_dofs+3)*num_all_dofs
template<size_t K>
__global__ void
gpu_curl_planar(const double *fx, const double *fy, const double *fz,
......@@ -78,6 +81,8 @@ gpu_curl_planar(const double *fx, const double *fy, const double *fz,
curl_fz[output_dof_offset] = alpha * (acc_dfy_dx - acc_dfx_dy);
}
// Flops count: (54*num_dofs+9)*num_all_dofs
// Mem accesses: (24*num_dofs+6)*num_all_dofs
template<size_t K>
__global__ void
gpu_curl_planar(const double *fx, const double *fy, const double *fz,
......@@ -151,6 +156,8 @@ gpu_curl_planar(const double *fx, const double *fy, const double *fz,
}
/* compute α∇F */
// Flops count: (21*num_dofs+3)*num_all_dofs
// Mem accesses: (15*num_dofs+3)*num_all_dofs
template<size_t K>
__global__ void
gpu_deriv_planar(const double *F, const double * __restrict__ J,
......@@ -227,8 +234,6 @@ launch_curl_kernel(const entity_data_gpu& edg,
if (edg.g_order == 1)
gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz,
J, Dtex, alpha, curl_fx, curl_fy, curl_fz, num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
template<size_t K>
......@@ -255,8 +260,6 @@ launch_curl_kernel(const entity_data_gpu& edg,
gpu_curl_planar<K><<<num_blocks, KS::curl_threads, 0, stream>>>(fx, fy, fz,
J, Dtex, alpha, curl_fx, curl_fy, curl_fz, src_x, src_y, src_z,
num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
template<size_t K>
......@@ -280,8 +283,6 @@ launch_deriv_kernel(const entity_data_gpu& edg,
if (edg.g_order == 1)
gpu_deriv_planar<K><<<num_blocks, KS::deriv_threads, 0, stream>>>(f, J,
Dtex, df_dx, df_dy, df_dz, alpha, num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
void
......@@ -458,8 +459,6 @@ launch_lift_kernel(entity_data_gpu& edg, const double *f, double *out, cudaStrea
if (edg.g_order == 1)
gpu_lift_planar<K><<<num_blocks, THREADS_PER_BLOCK, 0, stream>>>(f, Ltex,
dets, out, num_elems, orients, edg.dof_base, edg.flux_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
void
......@@ -493,10 +492,12 @@ gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out,
break;
default:
throw std::invalid_argument("compute_field_derivatives: invalid order");
throw std::invalid_argument("compute_flux_lifting: invalid order");
}
}
// Flops count: 1*num_all_dofs
// Mem accesses: 3*num_all_dofs
void __global__
gpu_curl_kernel(double * __restrict__ dst, const double * __restrict__ add,
const double * __restrict__ sub, size_t num_dofs)
......@@ -518,6 +519,8 @@ gpu_curl(double *dst, const double *add, const double *sub, size_t num_dofs,
gpu_curl_kernel<<<gs, SUBTRACT_THREADS, 0, stream>>>(dst, add, sub, num_dofs);
}
// Flops count: 2*num_all_dofs
// Mem accesses: 4*num_all_dofs
void __global__
gpu_curl_kernel(double * __restrict__ dst, const double * __restrict__ add,
const double * __restrict__ sub, const double * __restrict__ source,
......
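The Flops/Mem comments added above are the per-kernel counts that feed the profiler quantities registered in init_profiling below. A small illustration of the conversion, matching the expressions used there (variable names are illustrative; doubles are 8 bytes):

// hedged sketch: from the per-kernel counts to the registered quantities,
// here for the three-component curl kernel annotated above
double gflops = (54.0*cell_dofs + 9.0) * num_field_dofs / 1e9;        // floating-point ops
double gbytes = (24.0*cell_dofs + 6.0) * num_field_dofs * 8.0 / 1e9;  // 8 bytes per double
// achieved rates then follow from a measured kernel time:
//   gflops / elapsed_seconds, gbytes / elapsed_seconds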
......@@ -218,36 +218,30 @@ void init_profiling(const model& mod, solver_state_gpu& state, time_integrator_t
<< "num_field_dofs: " << num_field_dofs << std::endl
<< "num_field_fluxes: " << num_field_fluxes << std::endl;
auto jacobians_read = 3*3*num_cells;
auto dm_read = 3*cell_dofs*cell_dofs;
auto curl_readwrite = 6*num_field_dofs;
auto sources_application = 3*num_field_dofs;
if (ti == time_integrator_type::LEAPFROG)
{
#ifdef TM_NEW_CURLS
state.logger.register_profiler("compute_curls_H", profiler::type::GPU);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU);
state.logger.register_profiler("compute_curls_H", profiler::type::GPU)
.set_quantity("GFlops", (54*cell_dofs+9)*num_field_dofs/1e9)
.set_quantity("GB", (24*cell_dofs+6)*num_field_dofs*8/1e9);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU)
.set_quantity("GFlops", (54*cell_dofs+6)*num_field_dofs/1e9)
.set_quantity("GB", (24*cell_dofs+3)*num_field_dofs*8/1e9);
#else
auto derivatives = jacobians_read+dm_read+4*num_field_dofs;
auto curls_h = 4*num_field_dofs;
auto curls_e = 3*num_field_dofs;
state.logger.register_profiler("compute_curls_H", profiler::type::GPU);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU);
state.logger.register_profiler("compute_curls_H", profiler::type::GPU)
.set_quantity("GFlops", ((21*cell_dofs+3)*num_field_dofs+2*num_field_dofs)*3/1e9)
.set_quantity("GB", ((15*cell_dofs+3)*num_field_dofs+4*num_field_dofs)*3*8/1e9);
state.logger.register_profiler("compute_curls_E", profiler::type::GPU)
.set_quantity("GFlops", ((21*cell_dofs+3)*num_field_dofs+num_field_dofs)*3/1e9)
.set_quantity("GB", ((15*cell_dofs+3)*num_field_dofs+3*num_field_dofs)*3*8/1e9);
#endif /* TM_NEW_CURLS */
state.logger.register_profiler("compute_field_jumps", profiler::type::GPU);
auto normals_read = num_all_faces;
auto dets_read = num_all_faces;
auto sources_imposition = 18*num_field_fluxes;
auto matparams_readwrite = 4*num_field_fluxes;
auto fluxes_write = 3*num_field_fluxes;
state.logger.register_profiler("compute_field_fluxes", profiler::type::GPU);
dets_read = num_cells;
auto lm_read = num_all_orientations*cell_dofs*4*cell_fluxes;
auto lifting_readwrite = num_field_fluxes+num_field_dofs;
state.logger.register_profiler("compute_flux_lifting", profiler::type::GPU);
}
else
......@@ -256,24 +250,13 @@ void init_profiling(const model& mod, solver_state_gpu& state, time_integrator_t
state.logger.register_profiler("compute_curls", profiler::type::GPU);
#else
auto derivatives = jacobians_read+dm_read+4*num_field_dofs;
auto curls = 4*num_field_dofs + 3*num_field_dofs;
state.logger.register_profiler("compute_curls", profiler::type::GPU);
#endif /* TM_NEW_CURLS */
state.logger.register_profiler("compute_field_jumps", profiler::type::GPU);
auto normals_read = num_all_faces;
auto dets_read = num_all_faces;
auto sources_imposition = 18*num_field_fluxes;
auto matparams_readwrite = 4*num_field_fluxes;
auto fluxes_write = 3*num_field_fluxes;
state.logger.register_profiler("compute_field_fluxes", profiler::type::GPU);
dets_read = num_cells;
auto lm_read = num_all_orientations*cell_dofs*4*cell_fluxes;
auto lifting_readwrite = num_field_fluxes+num_field_dofs;
state.logger.register_profiler("compute_flux_lifting", profiler::type::GPU);
}
}
......@@ -603,8 +586,17 @@ compute_fluxes_E(const model& mod, solver_state_gpu& state,
auto& jumps_profiler = state.logger.get_profiler("compute_field_jumps");
#endif /* ENABLE_PROFILING */
PROFILER_TIC(jumps_profiler);
for (const auto& edg : state.edgs)
{
#ifdef TM_ONESHOT_JUMPS
maxwell::gpu_compute_jumps_oneshot_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#else
maxwell::gpu_compute_jumps_E(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#endif
}
PROFILER_TOC(jumps_profiler);
#ifdef USE_MPI
......@@ -688,8 +680,16 @@ compute_fluxes_H(const model& mod, solver_state_gpu& state,
auto& jumps_profiler = state.logger.get_profiler("compute_field_jumps");
#endif /* ENABLE_PROFILING */
PROFILER_TIC(jumps_profiler);
for (const auto& edg : state.edgs)
{
#ifdef TM_ONESHOT_JUMPS
maxwell::gpu_compute_jumps_oneshot_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#else
maxwell::gpu_compute_jumps_H(edg, in, state.jumps, bc_coeffs, state.compute_stream);
#endif
}
PROFILER_TOC(jumps_profiler);
#ifdef USE_MPI
......
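Whether the fused per-field kernels or the original gpu_compute_jumps_E/H path is used is decided purely at compile time, via the TM_ONESHOT_JUMPS guards above. A hedged example of enabling it (the exact build integration is not part of this commit; only the macro name comes from the code):

// add -DTM_ONESHOT_JUMPS to the CUDA/C++ compile flags, e.g. through the
// project's build system; with the macro undefined, compute_fluxes_E/H
// fall back to maxwell::gpu_compute_jumps_E / gpu_compute_jumps_H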
......@@ -94,6 +94,8 @@ gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& ju
checkGPU(cudaPeekAtLastError());
}
// Flops count: 6*num_all_dofs
// Mem accesses: 18*num_all_dofs
__global__ void
gpu_compute_jumps_kernel_oneshot(field_gpu::const_raw_ptrs field,
const size_t * __restrict__ fluxdofs_mine,
......@@ -221,6 +223,125 @@ gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu&
checkGPU(cudaPeekAtLastError());
}
// Flops count: 3*num_all_dofs
// Mem accesses: 9*num_all_dofs
__global__ void
gpu_compute_jumps_kernel_oneshot_H(field_gpu::const_raw_ptrs field,
const size_t * __restrict__ fluxdofs_mine,
const size_t * __restrict__ fluxdofs_other,
field_gpu::raw_ptrs jumps,
const double * __restrict__ bcjc,
size_t base, size_t max)
{
int32_t flux_ofs = blockIdx.x * blockDim.x + threadIdx.x;
if (flux_ofs >= max)
return;
auto idx_mine = fluxdofs_mine[flux_ofs];
auto idx_neigh = fluxdofs_other[flux_ofs];
if (idx_mine == NOT_PRESENT)
return;
if (idx_neigh != NOT_PRESENT)
{
jumps.Hx[base + flux_ofs] = field.Hx[idx_mine] - field.Hx[idx_neigh];
jumps.Hy[base + flux_ofs] = field.Hy[idx_mine] - field.Hy[idx_neigh];
jumps.Hz[base + flux_ofs] = field.Hz[idx_mine] - field.Hz[idx_neigh];
}
else
{
double bc_coeff = bcjc[base + flux_ofs];
jumps.Hx[base + flux_ofs] = (2.0 - bc_coeff)*field.Hx[idx_mine];
jumps.Hy[base + flux_ofs] = (2.0 - bc_coeff)*field.Hy[idx_mine];
jumps.Hz[base + flux_ofs] = (2.0 - bc_coeff)*field.Hz[idx_mine];
}
}
// Flops count: 3*num_all_dofs
// Mem accesses: 9*num_all_dofs
__global__ void
gpu_compute_jumps_kernel_oneshot_E(field_gpu::const_raw_ptrs field,
const size_t * __restrict__ fluxdofs_mine,
const size_t * __restrict__ fluxdofs_other,
field_gpu::raw_ptrs jumps,
const double * __restrict__ bcjc,
size_t base, size_t max)
{
int32_t flux_ofs = blockIdx.x * blockDim.x + threadIdx.x;
if (flux_ofs >= max)
return;
auto idx_mine = fluxdofs_mine[flux_ofs];
auto idx_neigh = fluxdofs_other[flux_ofs];
if (idx_mine == NOT_PRESENT)
return;
if (idx_neigh != NOT_PRESENT)
{
jumps.Ex[base + flux_ofs] = field.Ex[idx_mine] - field.Ex[idx_neigh];
jumps.Ey[base + flux_ofs] = field.Ey[idx_mine] - field.Ey[idx_neigh];
jumps.Ez[base + flux_ofs] = field.Ez[idx_mine] - field.Ez[idx_neigh];
}
else
{
double bc_coeff = bcjc[base + flux_ofs];
jumps.Ex[base + flux_ofs] = bc_coeff*field.Ex[idx_mine];
jumps.Ey[base + flux_ofs] = bc_coeff*field.Ey[idx_mine];
jumps.Ez[base + flux_ofs] = bc_coeff*field.Ez[idx_mine];
}
}
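A note on the boundary branch of the two kernels above (an interpretation, not stated in the commit): if the jump is read as field_mine minus a mirrored ghost state, then the complementary coefficients on E and H reproduce the usual wall conditions.

// hedged reading of bcjc, assuming a mirrored ghost state:
//   bc_coeff = 2.0  ->  E jump = 2*E_mine,  H jump = 0          (PEC-like wall)
//   bc_coeff = 0.0  ->  E jump = 0,         H jump = 2*H_mine   (PMC-like wall)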
void
gpu_compute_jumps_oneshot_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream)
{
static const size_t JUMP_THREADS = 128;
auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
auto gs = num_all_fluxes/JUMP_THREADS;
if (num_all_fluxes % JUMP_THREADS != 0)
gs += 1;
dim3 grid_size(gs);
dim3 threads_per_block(JUMP_THREADS);
/* Compute H-field jumps */
gpu_compute_jumps_kernel_oneshot_H<<<gs, threads_per_block, 0, stream>>>(in.data(),
edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.data(),
bc_coeffs, edg.flux_base, num_all_fluxes);
checkGPU(cudaPeekAtLastError());
}
void
gpu_compute_jumps_oneshot_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
double *bc_coeffs, gpuStream_t stream)
{
static const size_t JUMP_THREADS = 128;
auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
auto gs = num_all_fluxes/JUMP_THREADS;
if (num_all_fluxes % JUMP_THREADS != 0)
gs += 1;
dim3 grid_size(gs);
dim3 threads_per_block(JUMP_THREADS);
/* Compute E-field jumps */
gpu_compute_jumps_kernel_oneshot_E<<<gs, threads_per_block, 0, stream>>>(in.data(),
edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.data(),
bc_coeffs, edg.flux_base, num_all_fluxes);
checkGPU(cudaPeekAtLastError());
}
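Both launchers round the grid size up with the same divide-then-increment pattern. The equivalent ceil-division, as a tiny self-contained helper for reference (illustrative only, not part of the commit):

#include <cstddef>
// one extra block whenever the work size is not a multiple of the block size
static inline std::size_t blocks_for(std::size_t work_items, std::size_t block_size)
{
    return (work_items + block_size - 1) / block_size;
}
// e.g. blocks_for(num_all_fluxes, JUMP_THREADS) equals the gs computed above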
template<size_t K, bool do_E, bool do_H>
__global__ void
gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
......@@ -492,12 +613,6 @@ gpu_lift_planar_oneshot(field_gpu::raw_ptrs flux, gpuTextureObject_t LM_tex,
const double * __restrict__ dets, field_gpu::raw_ptrs lifted_flux,
int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base)
{
/* This kernel saturates the texture cache bandwidth (~1 TB/s on K20x!!)
* and thus it does not achieve good performance. This happens because
* the lifting matrix entries are not reused sufficiently. Sufficient
* reuse should be achieved by using a single entry to multiply different
* rhs instead of only one at a time. Orientations however make the
* implementation complicated, so for now the kernel remains slow. */
using KS = kernel_gpu_sizes<K>;
/* One thread per *output* dof */
......@@ -549,7 +664,7 @@ void
launch_lift_kernel_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
field_gpu::raw_ptrs& out, cudaStream_t stream)
{
const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
const auto THREADS_PER_BLOCK = 128;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1;
......@@ -566,8 +681,6 @@ launch_lift_kernel_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
dets, out, num_elems, orients, edg.dof_base, edg.flux_base);
checkGPU(cudaPeekAtLastError());
}
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
}
void
......@@ -601,7 +714,7 @@ gpu_compute_flux_lifting_oneshot(entity_data_gpu& edg, field_gpu::raw_ptrs& f,
break;
default:
throw std::invalid_argument("compute_field_derivatives: invalid order");
throw std::invalid_argument("compute_flux_lifting_oneshot: invalid order");
}
}
......