maxwell_kernels_cuda.cu
    #include "entity_data.h"
    #include "maxwell_interface.h"
    #include "kernels_gpu.h"
    
    namespace maxwell {
    
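    /* One thread per flux dof: compute the jump of `field` across a face.
     * For an internal face the jump is the difference between the local dof
     * and its neighbour; on a boundary face (neighbour dof NOT_PRESENT) the
     * boundary coefficient from `bcjc` multiplies the local value, with the
     * E- and H-field cases distinguished by the `e_field` template parameter.
     * Threads whose own dof is NOT_PRESENT return immediately. `base` is the
     * entity's offset into the global jump arrays and `max` the number of
     * flux dofs to process. */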
    template<bool e_field>
    __global__ void
    gpu_compute_jumps_kernel(const double * __restrict__ field,
        const size_t * __restrict__ fluxdofs_mine,
        const size_t * __restrict__ fluxdofs_other,
        double * __restrict__ jumps,
        const double * __restrict__ bcjc,
        size_t base, size_t max)
    {
        size_t flux_ofs = blockIdx.x * blockDim.x + threadIdx.x;
    
        if (flux_ofs >= max)
            return;
    
        auto idx_mine = fluxdofs_mine[flux_ofs];
        auto idx_neigh = fluxdofs_other[flux_ofs];
    
        if (idx_mine == NOT_PRESENT)
            return; 
    
        if (idx_neigh != NOT_PRESENT)
        {
            jumps[base + flux_ofs] = field[idx_mine] - field[idx_neigh];
        }
        else
        {
            double bc_coeff = bcjc[base + flux_ofs];
            if constexpr(e_field)
                jumps[base + flux_ofs] = bc_coeff*field[idx_mine];
            else
                jumps[base + flux_ofs] = (2.0 - bc_coeff)*field[idx_mine];
        }
    }
    
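    /* Compute the jumps of all six field components of one entity. Kernels
     * are launched asynchronously on `stream`; only the launch is checked. */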
    void
    gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
        double *bc_coeffs, cudaStream_t stream)
    {
        static const size_t JUMP_THREADS = 256;
    
        auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;    
    
        auto gs = num_all_fluxes/JUMP_THREADS;
        if (num_all_fluxes % JUMP_THREADS != 0)
            gs += 1;
    
        dim3 threads_per_block(JUMP_THREADS);
    
        /* Compute E-field jumps */
        gpu_compute_jumps_kernel<true><<<gs, threads_per_block, 0, stream>>>(in.Ex.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ex.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
        
        gpu_compute_jumps_kernel<true><<<gs, threads_per_block, 0, stream>>>(in.Ey.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ey.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    
        gpu_compute_jumps_kernel<true><<<gs, threads_per_block, 0, stream>>>(in.Ez.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ez.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    
        /* Compute H-field jumps */
        gpu_compute_jumps_kernel<false><<<gs, threads_per_block, 0, stream>>>(in.Hx.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hx.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
        
        gpu_compute_jumps_kernel<false><<<gs, threads_per_block, 0, stream>>>(in.Hy.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hy.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    
        gpu_compute_jumps_kernel<false><<<gs, threads_per_block, 0, stream>>>(in.Hz.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hz.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    }
    
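    /* Same as gpu_compute_jumps(), restricted to the E-field components. */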
    void
    gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
        double *bc_coeffs, cudaStream_t stream)
    {
        static const size_t JUMP_THREADS = 256;
    
        auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;    
    
        auto gs = num_all_fluxes/JUMP_THREADS;
        if (num_all_fluxes % JUMP_THREADS != 0)
            gs += 1;
    
        dim3 threads_per_block(JUMP_THREADS);
    
        /* Compute E-field jumps */
        gpu_compute_jumps_kernel<true><<<gs, threads_per_block, 0, stream>>>(in.Ex.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ex.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
        
        gpu_compute_jumps_kernel<true><<<gs, threads_per_block, 0, stream>>>(in.Ey.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ey.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    
        gpu_compute_jumps_kernel<true><<<gs, threads_per_block, 0, stream>>>(in.Ez.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ez.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    }
    
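    /* Same as gpu_compute_jumps(), restricted to the H-field components. */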
    void
    gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
        double *bc_coeffs, cudaStream_t stream)
    {
        static const size_t JUMP_THREADS = 256;
    
        auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;    
    
        auto gs = num_all_fluxes/JUMP_THREADS;
        if (num_all_fluxes % JUMP_THREADS != 0)
            gs += 1;
    
        dim3 threads_per_block(JUMP_THREADS);
    
        /* Compute H-field jumps */
        gpu_compute_jumps_kernel<false><<<gs, threads_per_block, 0, stream>>>(in.Hx.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hx.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
        
        gpu_compute_jumps_kernel<false><<<gs, threads_per_block, 0, stream>>>(in.Hy.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hy.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    
        gpu_compute_jumps_kernel<false><<<gs, threads_per_block, 0, stream>>>(in.Hz.data(),
            edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hz.data(),
            bc_coeffs, edg.flux_base, num_all_fluxes);
        checkGPU(cudaPeekAtLastError());
    }
    
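    /* Flux evaluation on planar faces for approximation order K. Each thread
     * handles one flux dof: it corrects the jumps with the boundary sources,
     * projects them on the face normal, and assembles the E- and H-field
     * fluxes from the normal components, the face determinant and the
     * per-dof material coefficients aE/bE/aH/bH. */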
    template<size_t K>
    __global__ void
    gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
        field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
        const double * __restrict__ face_dets, const double * __restrict__ face_normals,
        size_t flux_base, size_t num_fluxes)
    {
        using KS = kernel_gpu_sizes<K>;
        auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
        auto global_dof_pos = flux_base + local_dof_pos;
    
        auto entry_num = local_dof_pos/KS::num_fluxes;
    
        if (local_dof_pos >= num_fluxes)
            return;
    
        auto face_det = face_dets[entry_num];
        auto nx = face_normals[3*entry_num + 0];
        auto ny = face_normals[3*entry_num + 1];
        auto nz = face_normals[3*entry_num + 2];
    
        auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
        auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
        auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
        auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
        auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
        auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
    
        auto ndotE = nx*jEx + ny*jEy + nz*jEz;
        auto ndotH = nx*jHx + ny*jHy + nz*jHz;
        
        auto aE = face_det * fcp.aE[global_dof_pos];
        auto bE = face_det * fcp.bE[global_dof_pos];
        auto aH = face_det * fcp.aH[global_dof_pos];
        auto bH = face_det * fcp.bH[global_dof_pos];
     
        flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
        flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
        flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
        flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
        flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
        flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
    }
    
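    /* Same flux evaluation as gpu_compute_fluxes_kernel_planar, but only the
     * E-field flux components are written. */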
    template<size_t K>
    __global__ void
    gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
        field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
        const double * __restrict__ face_dets, const double * __restrict__ face_normals,
        size_t flux_base, size_t num_fluxes)
    {
        using KS = kernel_gpu_sizes<K>;
        auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
        auto global_dof_pos = flux_base + local_dof_pos;
    
        auto entry_num = local_dof_pos/KS::num_fluxes;
    
        if (local_dof_pos >= num_fluxes)
            return;
    
        auto face_det = face_dets[entry_num];
        auto nx = face_normals[3*entry_num + 0];
        auto ny = face_normals[3*entry_num + 1];
        auto nz = face_normals[3*entry_num + 2];
    
        auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
        auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
        auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
        auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
        auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
        auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
    
        auto ndotE = nx*jEx + ny*jEy + nz*jEz;
    
        auto aE = face_det * fcp.aE[global_dof_pos];
        auto bE = face_det * fcp.bE[global_dof_pos];
     
        flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
        flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
        flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
    }
    
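    /* Same flux evaluation as gpu_compute_fluxes_kernel_planar, but only the
     * H-field flux components are written. */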
    template<size_t K>
    __global__ void
    gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
        field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
        const double * __restrict__ face_dets, const double * __restrict__ face_normals,
        size_t flux_base, size_t num_fluxes)
    {
        using KS = kernel_gpu_sizes<K>;
        auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
        auto global_dof_pos = flux_base + local_dof_pos;
    
        auto entry_num = local_dof_pos/KS::num_fluxes;
    
        if (local_dof_pos >= num_fluxes)
            return;
    
        auto face_det = face_dets[entry_num];
        auto nx = face_normals[3*entry_num + 0];
        auto ny = face_normals[3*entry_num + 1];
        auto nz = face_normals[3*entry_num + 2];
    
        auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
        auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
        auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
        auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
        auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
        auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];
    
        auto ndotH = nx*jHx + ny*jHy + nz*jHz;
        
        auto aH = face_det * fcp.aH[global_dof_pos];
        auto bH = face_det * fcp.bH[global_dof_pos];
     
        flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
        flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
        flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
    }
    
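    /* Common launcher for the flux kernels: computes the launch geometry,
     * unpacks the raw device pointers and starts `kernel` on `stream`.
     * The order K is not used here, only by the instantiated kernel. */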
    template<size_t K, typename Kernel>
    void
    launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
        field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
        cudaStream_t stream)
    {
        static const size_t FLUX_THREADS = 256;
    
        auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
    
        auto gs = num_all_fluxes/FLUX_THREADS;
        if (num_all_fluxes % FLUX_THREADS != 0)
            gs += 1;
    
        auto jp = jumps.data();
        auto fp = fluxes.data();
        auto pp = fcp.data();
        auto bs = bndsrc.data();
    
        kernel<<<gs, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
            edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
            num_all_fluxes);
    }
    
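    /* Select the flux kernel instantiation matching the entity's
     * approximation order. */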
    void
    gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
        field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
        cudaStream_t stream)
    {
        switch( edg.a_order )
        {
            case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
            
            case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
            
            case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            default: throw std::invalid_argument("unsupported order");
        }
    }
    
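    /* E-field-only counterpart of gpu_compute_fluxes(). */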
    void
    gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
        field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
        cudaStream_t stream)
    {
        switch( edg.a_order )
        {
            case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
            
            case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
            
            case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            default: throw std::invalid_argument("unsupported order");
        }
    }
    
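    /* H-field-only counterpart of gpu_compute_fluxes(). */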
    void
    gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
        field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
        cudaStream_t stream)
    {
        switch( edg.a_order )
        {
            case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
            
            case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
            
            case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
                edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
    
            default: throw std::invalid_argument("unsupported order");
        }
    }
    
    } // namespace maxwell