/* maxwell_kernels_cuda.cu */
#include "entity_data.h"
#include "maxwell_interface.h"
#include "kernels_gpu.h"
namespace maxwell {
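
/* Per-flux-DOF kernel computing the jump of one field component across element
 * faces: one thread per flux DOF. Interior faces store the difference with the
 * neighbouring value; boundary faces (neighbour == NOT_PRESENT) scale the local
 * value by the boundary-condition coefficient (bc_coeff for E, 2 - bc_coeff for H). */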
template<bool e_field>
__global__ void
gpu_compute_jumps_kernel(const double * __restrict__ field,
    const size_t * __restrict__ fluxdofs_mine,
    const size_t * __restrict__ fluxdofs_other,
    double * __restrict__ jumps,
    const double * __restrict__ bcjc,
    size_t base, size_t max)
{
    int32_t flux_ofs = blockIdx.x * blockDim.x + threadIdx.x;
    if (flux_ofs >= max)
        return;

    auto idx_mine  = fluxdofs_mine[flux_ofs];
    auto idx_neigh = fluxdofs_other[flux_ofs];

    if (idx_mine == NOT_PRESENT)
        return;

    if (idx_neigh != NOT_PRESENT)
    {
        jumps[base + flux_ofs] = field[idx_mine] - field[idx_neigh];
    }
    else
    {
        double bc_coeff = bcjc[base + flux_ofs];
        if constexpr(e_field)
            jumps[base + flux_ofs] = bc_coeff*field[idx_mine];
        else
            jumps[base + flux_ofs] = (2.0 - bc_coeff)*field[idx_mine];
    }
}
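
/* Host-side wrapper: computes the jumps for all six field components of one
 * entity. The grid covers num_all_elems * num_fluxes * num_faces_per_elem flux
 * DOFs; all launches are asynchronous on the supplied stream. */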
void
gpu_compute_jumps(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
    double *bc_coeffs, cudaStream_t stream)
{
    static const size_t JUMP_THREADS = 256;

    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
    auto gs = num_all_fluxes/JUMP_THREADS;
    if (num_all_fluxes % JUMP_THREADS != 0)
        gs += 1;

    dim3 grid_size(gs);
    dim3 threads_per_block(JUMP_THREADS);

    /* Compute E-field jumps */
    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ex.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ex.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ey.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ey.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ez.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ez.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    /* Compute H-field jumps */
    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hx.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hx.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hy.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hy.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hz.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hz.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());
}
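
/* Same as gpu_compute_jumps(), restricted to the E-field components. */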
void
gpu_compute_jumps_E(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
    double *bc_coeffs, cudaStream_t stream)
{
    static const size_t JUMP_THREADS = 256;

    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
    auto gs = num_all_fluxes/JUMP_THREADS;
    if (num_all_fluxes % JUMP_THREADS != 0)
        gs += 1;

    dim3 grid_size(gs);
    dim3 threads_per_block(JUMP_THREADS);

    /* Compute E-field jumps */
    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ex.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ex.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ey.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ey.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<true><<<grid_size, threads_per_block, 0, stream>>>(in.Ez.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Ez.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());
}
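
/* Same as gpu_compute_jumps(), restricted to the H-field components. */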
void
gpu_compute_jumps_H(const entity_data_gpu& edg, const field_gpu& in, field_gpu& jumps,
    double *bc_coeffs, cudaStream_t stream)
{
    static const size_t JUMP_THREADS = 256;

    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
    auto gs = num_all_fluxes/JUMP_THREADS;
    if (num_all_fluxes % JUMP_THREADS != 0)
        gs += 1;

    dim3 grid_size(gs);
    dim3 threads_per_block(JUMP_THREADS);

    /* Compute H-field jumps */
    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hx.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hx.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hy.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hy.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());

    gpu_compute_jumps_kernel<false><<<grid_size, threads_per_block, 0, stream>>>(in.Hz.data(),
        edg.fluxdofs_mine.data(), edg.fluxdofs_neigh.data(), jumps.Hz.data(),
        bc_coeffs, edg.flux_base, num_all_fluxes);
    checkGPU(cudaPeekAtLastError());
}
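
/* Per-flux-DOF kernel evaluating the numerical flux on planar faces, one thread
 * per flux DOF. The boundary-source contribution is folded into the jumps, and
 * the flux is assembled from the n x [H] / n x [E] terms plus penalty terms,
 * weighted by the per-DOF coefficients aE/bE/aH/bH and the face determinant. */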
template<size_t K>
__global__ void
gpu_compute_fluxes_kernel_planar(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
    size_t flux_base, size_t num_fluxes)
{
    using KS = kernel_gpu_sizes<K>;

    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
    auto global_dof_pos = flux_base + local_dof_pos;
    auto entry_num = local_dof_pos/KS::num_fluxes;
    if (local_dof_pos >= num_fluxes)
        return;

    auto face_det = face_dets[entry_num];
    auto nx = face_normals[3*entry_num + 0];
    auto ny = face_normals[3*entry_num + 1];
    auto nz = face_normals[3*entry_num + 2];

    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];

    auto ndotE = nx*jEx + ny*jEy + nz*jEz;
    auto ndotH = nx*jHx + ny*jHy + nz*jHz;

    auto aE = face_det * fcp.aE[global_dof_pos];
    auto bE = face_det * fcp.bE[global_dof_pos];
    auto aH = face_det * fcp.aH[global_dof_pos];
    auto bH = face_det * fcp.bH[global_dof_pos];

    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
}
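
/* Same as gpu_compute_fluxes_kernel_planar, but writes only the E-field fluxes. */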
template<size_t K>
__global__ void
gpu_compute_fluxes_kernel_planar_E(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
    size_t flux_base, size_t num_fluxes)
{
    using KS = kernel_gpu_sizes<K>;

    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
    auto global_dof_pos = flux_base + local_dof_pos;
    auto entry_num = local_dof_pos/KS::num_fluxes;
    if (local_dof_pos >= num_fluxes)
        return;

    auto face_det = face_dets[entry_num];
    auto nx = face_normals[3*entry_num + 0];
    auto ny = face_normals[3*entry_num + 1];
    auto nz = face_normals[3*entry_num + 2];

    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];

    auto ndotE = nx*jEx + ny*jEy + nz*jEz;

    auto aE = face_det * fcp.aE[global_dof_pos];
    auto bE = face_det * fcp.bE[global_dof_pos];

    flux.Ex[global_dof_pos] = aE*(nz*jHy - ny*jHz) + bE*(ndotE*nx - jEx);
    flux.Ey[global_dof_pos] = aE*(nx*jHz - nz*jHx) + bE*(ndotE*ny - jEy);
    flux.Ez[global_dof_pos] = aE*(ny*jHx - nx*jHy) + bE*(ndotE*nz - jEz);
}
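
/* Same as gpu_compute_fluxes_kernel_planar, but writes only the H-field fluxes. */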
template<size_t K>
__global__ void
gpu_compute_fluxes_kernel_planar_H(field_gpu::const_raw_ptrs jump, field_gpu::raw_ptrs flux,
    field_gpu::const_raw_ptrs bndsrc, material_params_gpu::const_raw_ptrs fcp,
    const double * __restrict__ face_dets, const double * __restrict__ face_normals,
    size_t flux_base, size_t num_fluxes)
{
    using KS = kernel_gpu_sizes<K>;

    auto local_dof_pos = blockIdx.x * blockDim.x + threadIdx.x;
    auto global_dof_pos = flux_base + local_dof_pos;
    auto entry_num = local_dof_pos/KS::num_fluxes;
    if (local_dof_pos >= num_fluxes)
        return;

    auto face_det = face_dets[entry_num];
    auto nx = face_normals[3*entry_num + 0];
    auto ny = face_normals[3*entry_num + 1];
    auto nz = face_normals[3*entry_num + 2];

    auto jEx = jump.Ex[global_dof_pos] - bndsrc.Ex[global_dof_pos];
    auto jEy = jump.Ey[global_dof_pos] - bndsrc.Ey[global_dof_pos];
    auto jEz = jump.Ez[global_dof_pos] - bndsrc.Ez[global_dof_pos];
    auto jHx = jump.Hx[global_dof_pos] + bndsrc.Hx[global_dof_pos];
    auto jHy = jump.Hy[global_dof_pos] + bndsrc.Hy[global_dof_pos];
    auto jHz = jump.Hz[global_dof_pos] + bndsrc.Hz[global_dof_pos];

    auto ndotH = nx*jHx + ny*jHy + nz*jHz;

    auto aH = face_det * fcp.aH[global_dof_pos];
    auto bH = face_det * fcp.bH[global_dof_pos];

    flux.Hx[global_dof_pos] = aH*(ny*jEz - nz*jEy) + bH*(ndotH*nx - jHx);
    flux.Hy[global_dof_pos] = aH*(nz*jEx - nx*jEz) + bH*(ndotH*ny - jHy);
    flux.Hz[global_dof_pos] = aH*(nx*jEy - ny*jEx) + bH*(ndotH*nz - jHz);
}
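
/* Common launcher for the flux kernels above: sizes the grid to cover all flux
 * DOFs of the entity and forwards the raw device pointers to the chosen kernel. */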
template<size_t K, typename Kernel>
void
launch_compute_fluxes_kernel(Kernel& kernel, const entity_data_gpu& edg, const field_gpu& jumps,
    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
    cudaStream_t stream)
{
    static const size_t FLUX_THREADS = 256;

    auto num_all_fluxes = edg.num_all_elems*edg.num_fluxes*edg.num_faces_per_elem;
    auto gs = num_all_fluxes/FLUX_THREADS;
    if (num_all_fluxes % FLUX_THREADS != 0)
        gs += 1;

    dim3 grid_size(gs);

    auto jp = jumps.data();
    auto fp = fluxes.data();
    auto pp = fcp.data();
    auto bs = bndsrc.data();

    kernel<<<grid_size, FLUX_THREADS, 0, stream>>>(jp, fp, bs, pp,
        edg.face_determinants.data(), edg.normals.data(), edg.flux_base,
        num_all_fluxes);
}
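
/* Dispatch on edg.a_order (presumably the approximation order), selecting the
 * flux-kernel instantiation for orders 1 to 6. */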
void
gpu_compute_fluxes(const entity_data_gpu& edg, const field_gpu& jumps,
    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
    cudaStream_t stream)
{
    switch( edg.a_order )
    {
        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar<1>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar<2>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar<3>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar<4>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar<5>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar<6>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        default: throw std::invalid_argument("unsupported order");
    }
}
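
/* E-field-only variant of gpu_compute_fluxes(). */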
void
gpu_compute_fluxes_E(const entity_data_gpu& edg, const field_gpu& jumps,
    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
    cudaStream_t stream)
{
    switch( edg.a_order )
    {
        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_E<1>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_E<2>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_E<3>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_E<4>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_E<5>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_E<6>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        default: throw std::invalid_argument("unsupported order");
    }
}
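
/* H-field-only variant of gpu_compute_fluxes(). */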
void
gpu_compute_fluxes_H(const entity_data_gpu& edg, const field_gpu& jumps,
    field_gpu& fluxes, const field_gpu& bndsrc, const material_params_gpu& fcp,
    cudaStream_t stream)
{
    switch( edg.a_order )
    {
        case 1: launch_compute_fluxes_kernel<1>(gpu_compute_fluxes_kernel_planar_H<1>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 2: launch_compute_fluxes_kernel<2>(gpu_compute_fluxes_kernel_planar_H<2>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 3: launch_compute_fluxes_kernel<3>(gpu_compute_fluxes_kernel_planar_H<3>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 4: launch_compute_fluxes_kernel<4>(gpu_compute_fluxes_kernel_planar_H<4>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 5: launch_compute_fluxes_kernel<5>(gpu_compute_fluxes_kernel_planar_H<5>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        case 6: launch_compute_fluxes_kernel<6>(gpu_compute_fluxes_kernel_planar_H<6>,
                    edg, jumps, fluxes, bndsrc, fcp, stream);
                break;
        default: throw std::invalid_argument("unsupported order");
    }
}
} // namespace maxwell