Commit 723ff703 authored by Matteo Cicuttin

Started rewriting CUDA kernels.

parent c9a1ce9e
@@ -66,7 +66,6 @@ link_directories(${GMSH_LIB})
######################################################################
## Use or not use GPU solvers
option(OPT_ENABLE_GPU_SOLVER "Enable GPU solvers" OFF)
if (OPT_ENABLE_GPU_SOLVER)
add_definitions(-DENABLE_GPU_SOLVER)
@@ -117,9 +116,14 @@ if (OPT_ENABLE_GPU_SOLVER)
add_definitions(-D__HIP_PLATFORM_HCC__)
endif()
endif()
endif()
if (OPT_ENABLE_GPU_SOLVER)
option(OPT_DEBUG_GPU "Enable GPU debugging" OFF)
if (OPT_DEBUG_GPU)
add_definitions(-DGPU_DEBUG)
endif()
endif()
######################################################################
## Compiler identification stuff
@@ -235,12 +239,15 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
set(COMMON_SOURCES gmsh_io.cpp reference_element.cpp physical_element.cpp entity.cpp kernels_cpu.cpp connectivity.cpp entity_data.cpp)
set(LIBGMSHDG_SOURCES src/gmsh_io.cpp
src/entity.cpp
src/reference_element.cpp
src/physical_element.cpp
src/entity.cpp
src/kernels_cpu.cpp
src/connectivity.cpp
src/entity_data.cpp)
src/entity_data.cpp
src/kernels_cpu.cpp
src/kernels_gpu_mi.cpp
src/kernels_cuda.cu
)
add_library(gmshdg SHARED ${LIBGMSHDG_SOURCES})
target_link_libraries(gmshdg ${LINK_LIBS})
@@ -14,3 +14,4 @@ using MatxCM = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>
template<typename T>
using MatxRM = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
#pragma once
#include <numeric>
#include "eigen.h"
#include "types.h"
@@ -57,13 +59,20 @@ struct entity_data_cpu
vecxd face_determinants;
};
inline size_t
num_elems_all_orientations(const entity_data_cpu& ed)
{
assert(ed.num_elems.size() == ed.num_orientations);
return std::accumulate(ed.num_elems.begin(), ed.num_elems.end(), size_t(0));
}
#ifdef ENABLE_GPU_SOLVER
struct entity_data_gpu
{
int g_order;
int a_order;
size_t* num_elems;
size_t num_orientations;
size_t num_all_elems;
size_t * element_orientation;
size_t num_bf;
size_t dof_base;
@@ -79,13 +88,12 @@ struct entity_data_gpu
~entity_data_gpu();
};
template<typename T>
void
gpu_copyin(const std::vector<T> src, T** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
template<typename T>
@@ -93,7 +101,7 @@ void
gpu_copyin(const MatxCM<T>& src, T** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
template<typename T>
@@ -101,7 +109,7 @@ void
gpu_copyin(const MatxRM<T>& src, T** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
template<typename T>
@@ -109,7 +117,9 @@ void
gpu_copyin(const Eigen::Matrix<T, Eigen::Dynamic, 1>& src, double** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
#endif /* ENABLE_GPU_SOLVER */
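The fix above dereferences dst in the copy: gpuMalloc writes the device address into *dst, so that address, not the host-side pointer-to-pointer, must be the memcpy destination. A minimal usage sketch, assuming the gpuMalloc/gpuMemcpy/gpuFree wrappers mirror their CUDA counterparts and that this header is included:

#include <vector>

void upload_example()   // hypothetical helper, not part of the commit
{
    std::vector<double> host(1024, 1.0);   // host-side data
    double* dev = nullptr;                 // receives the device address
    gpu_copyin(host, &dev);                // allocates on the device, then copies host -> *(&dev)
    // ... launch kernels that read dev ...
    checkGPU( gpuFree(dev) );              // release the device buffer when done
}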
@@ -168,10 +168,10 @@ public:
device_vector(T *src, size_t size)
: d_vptr(nullptr), vsize(0)
{
init(src, size);
copyin(src, size);
}
void init(T *src, size_t size)
void copyin(T *src, size_t size)
{
if (d_vptr != nullptr)
(void) gpuFree(d_vptr);
@@ -181,6 +181,14 @@ public:
vsize = size;
}
void copyout(T *dst)
{
if (d_vptr == nullptr)
return;
(void)checkGPU( gpuMemcpy(dst, d_vptr, vsize*sizeof(T), gpuMemcpyDeviceToHost) );
}
void resize(size_t size)
{
if (d_vptr != nullptr)
@@ -353,7 +361,7 @@ public:
float elapsed;
gpuEventSynchronize(stop);
gpuEventElapsedTime(&elapsed, start, stop);
return elapsed/1000.0;
return double(elapsed)/1000.0;
}
};
@@ -2,9 +2,12 @@
#include "entity_data.h"
template<size_t K>
struct kernel_gpu_sizes;
/* Compile-time constants configuring sizes for the kernels */
/* Compile-time constants configuring sizes for the kernels.
* Elements are organized in blocks of cells_per_dblock elements each.
* Each dblock contains dblock_size dofs, where dblock_size must be a
* multiple of the warp size. This means that some dofs in the dblock
* will be padding. The effective dofs used are dblock_bf.*/
template<size_t K>
struct kernel_gpu_sizes;
@@ -13,7 +16,7 @@ struct kernel_gpu_sizes<1>
{
static const size_t num_bf = 4;
static const size_t cells_per_dblock = 32;
static const size_t dblock_bf = num_bf * vols_per_ublock;
static const size_t dblock_bf = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
@@ -23,7 +26,17 @@ struct kernel_gpu_sizes<2>
{
static const size_t num_bf = 10;
static const size_t cells_per_dblock = 12;
static const size_t dblock_bf = num_bf * vols_per_ublock;
static const size_t dblock_bf = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 1;
};
template<>
struct kernel_gpu_sizes<3>
{
static const size_t num_bf = 20;
static const size_t cells_per_dblock = 12;
static const size_t dblock_bf = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 1;
};
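As an illustration of the padding described in the comment above (numbers taken from the specializations in this header; the static_asserts are only a sketch, not part of the commit):

static_assert(kernel_gpu_sizes<1>::dblock_bf == 4*32,
              "order 1: 128 used dofs fill the 128-dof dblock exactly, no padding");
static_assert(kernel_gpu_sizes<2>::dblock_size - kernel_gpu_sizes<2>::dblock_bf == 8,
              "order 2: 10 bf x 12 cells = 120 used dofs, 8 padding dofs per 128-dof dblock");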
@@ -38,8 +51,7 @@ struct kernel_gpu_sizes_runtime
};
template<size_t N>
kernel_gpu_sizes_runtime
get_kernel_gpu_sizes()
kernel_gpu_sizes_runtime get_kernel_gpu_sizes()
{
kernel_gpu_sizes_runtime ret;
@@ -47,13 +59,12 @@ get_kernel_gpu_sizes()
ret.cells_per_dblock = kernel_gpu_sizes<N>::cells_per_dblock;
ret.dblock_bf = kernel_gpu_sizes<N>::dblock_bf;
ret.dblock_size = kernel_gpu_sizes<N>::dblock_size;
ret.parallel_dblocks = kernel_gpu_sizes<N>::slice_size;
ret.parallel_dblocks = kernel_gpu_sizes<N>::parallel_dblocks;
return ret;
};
inline
kernel_gpu_sizes_runtime
inline kernel_gpu_sizes_runtime
get_kernel_gpu_sizes(int approx_order)
{
switch (approx_order)
@@ -65,49 +76,15 @@ get_kernel_gpu_sizes(int approx_order)
throw std::invalid_argument("Can't retrieve info for this element");
}
/* Return the number of dofs that this entity will take on GPU,
* including all the padding dofs */
size_t gpu_dofs(const entity_data_cpu& ed)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (num_dblocks % ks.cells_per_dblock)
num_dblocks += 1;
return num_dblocks * ks.dblock_size;
}
size_t gpu_dblocks(const entity_data_cpu&);
size_t gpu_dblocks_dofs(const entity_data_cpu&);
#define TO_GPU true
#define FROM_GPU false
void reshape_dofs(const entity_data_cpu&, const entity_data_gpu&, const vecxd&, vecxd&, bool);
void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
const vecxd& in_field, vecxd& out_field, bool to_gpu)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (num_dblocks % ks.cells_per_dblock)
num_dblocks += 1;
for (size_t i = 0; i < num_dblocks; i++)
{
for (size_t j = 0; j < ks.cells_per_dblock; j++)
{
auto cur_elem = ks.cells_per_dblock*i + j;
if (cur_elem >= tot_elems)
break;
auto cpu_base = ed.dof_base + cur_elem * ed.num_bf;
auto gpu_base = edg.dof_base + ks.dblock_size*i + ks.num_bf*j;
if (to_gpu)
for (size_t k = 0; k < ks.num_bf; j++)
out_field(gpu_base + k) = in_field(cpu_base + k);
else
for (size_t k = 0; k < ks.num_bf; j++)
out_field(cpu_base + k) = in_field(gpu_base + k);
}
}
}
void
gpu_compute_field_derivatives(entity_data_gpu& edg,
const double* F, double *dF_dx, double* dF_dy, double* dF_dz);
@@ -2,7 +2,7 @@
entity_data_gpu::entity_data_gpu()
{
num_elems = nullptr;
element_orientation = nullptr;
differentiation_matrices = nullptr;
jacobians = nullptr;
cell_determinants = nullptr;
@@ -13,8 +13,18 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
g_order = ed.g_order;
a_order = ed.a_order;
gpu_copyin(ed.num_elems, &num_elems);
num_orientations = ed.num_orientations;
num_all_elems = num_elems_all_orientations(ed);
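/* Build a per-element orientation lookup: entry iE of element_orientation
 * holds the orientation index of element iE in the entity-wide numbering. */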
std::vector<size_t> eo;
eo.reserve(num_all_elems);
for (size_t iO = 0; iO < ed.num_orientations; iO++)
{
for (size_t iT = 0; iT < ed.num_elems[iO]; iT++)
eo.push_back(iO);
}
assert(eo.size() == num_all_elems);
gpu_copyin(eo, &element_orientation);
num_bf = ed.num_bf;
dof_base = 0;
@@ -35,8 +45,9 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
entity_data_gpu::~entity_data_gpu()
{
if (num_elems)
checkGPU( gpuFree(num_elems) );
/*
if (element_orientation)
checkGPU( gpuFree(element_orientation) );
if (differentiation_matrices)
checkGPU( gpuFree(differentiation_matrices) );
@@ -46,5 +57,6 @@ entity_data_gpu::~entity_data_gpu()
if (cell_determinants)
checkGPU( gpuFree(cell_determinants) );
*/
}
#include "entity_data.h"
#include "kernels_gpu.h"
template<size_t K>
__global__ void
gpu_compute_field_derivatives_kernel_planar(entity_data_gpu edg,
const double* F, double *dF_dx, double* dF_dy, double* dF_dz)
{
using KS = kernel_gpu_sizes<K>;
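/* One thread per output dof: with dof_base set to 0 (as in the entity_data_gpu
 * constructor), thread ofs_in_entity handles basis function ofs_in_entity % num_bf
 * of element ofs_in_entity / num_bf of this entity. */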
int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
int32_t cur_dof_offset = edg.dof_base + ofs_in_entity;
if (cur_dof_offset >= edg.num_all_elems*edg.num_bf)
return;
int32_t iT = cur_dof_offset / KS::num_bf;
int32_t elem_dof_base = edg.dof_base + iT*edg.num_bf;
int32_t iO = edg.element_orientation[iT];
int32_t DM_row = ofs_in_entity % KS::num_bf;
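/* The three num_bf x num_bf differentiation matrices of each orientation are
 * stored back to back (presumably the reference-direction derivatives);
 * DM points at the first matrix of orientation iO. */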
int32_t DM_orient = 3*KS::num_bf*iO*KS::num_bf;
double *DM = edg.differentiation_matrices + DM_orient;
double accm_dF_dx = 0.0;
double accm_dF_dy = 0.0;
double accm_dF_dz = 0.0;
int32_t jac_ofs = 9*iT;
double J[9];
for (int32_t i = 0; i < 9; i++)
J[i] = edg.jacobians[jac_ofs+i];
for (int32_t dof = 0; dof < KS::num_bf; dof++)
{
int32_t d_ofs = DM_row + KS::num_bf*dof;
int32_t f_ofs = elem_dof_base + dof;
double v = DM[d_ofs] * F[f_ofs];
accm_dF_dx += J[0] * v;
accm_dF_dy += J[3] * v;
accm_dF_dz += J[6] * v;
}
for (int32_t dof = 0; dof < KS::num_bf; dof++)
{
int32_t d_ofs = DM_row + KS::num_bf*KS::num_bf + KS::num_bf*dof;
int32_t f_ofs = elem_dof_base + dof;
double v = DM[d_ofs] * F[f_ofs];
accm_dF_dx += J[1] * v;
accm_dF_dy += J[4] * v;
accm_dF_dz += J[7] * v;
}
for (int32_t dof = 0; dof < KS::num_bf; dof++)
{
int32_t d_ofs = DM_row + 2*KS::num_bf*KS::num_bf + KS::num_bf*dof;
int32_t f_ofs = elem_dof_base + dof;
double v = DM[d_ofs] * F[f_ofs];
accm_dF_dx += J[2] * v;
accm_dF_dy += J[5] * v;
accm_dF_dz += J[8] * v;
}
dF_dx[cur_dof_offset] = accm_dF_dx;
dF_dy[cur_dof_offset] = accm_dF_dy;
dF_dz[cur_dof_offset] = accm_dF_dz;
}
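The three accumulation loops above appear to apply the chain rule from reference to physical coordinates, reading edg.jacobians as nine doubles per element forming a 3x3 matrix J (row m = x,y,z, column k = reference direction):

\[ \left.\frac{\partial F}{\partial x_m}\right|_{T,i} \;=\; \sum_{k=0}^{2} J^{(T)}_{mk} \sum_{j=0}^{N_{bf}-1} (D^{(O)}_k)_{ij}\, F_{T,j}, \qquad m \in \{0,1,2\}, \]

where J^{(T)}_{mk} = edg.jacobians[9*iT + 3*m + k] and (D^{(O)}_k)_{ij} is entry i + num_bf*j of the k-th differentiation matrix of orientation O (the layout of jacobians is inferred from the indexing above).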
void
gpu_compute_field_derivatives(entity_data_gpu& edg,
const double* f, double *df_dx, double* df_dy, double* df_dz)
{
const auto THREADS_PER_BLOCK = 256;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1;
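/* Ceiling division over all dofs of the entity; e.g. a hypothetical order-2
 * entity with 1000 elements has 10*1000 = 10000 dofs, giving 40 blocks of 256 threads. */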
switch (edg.a_order)
{
case 1:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<1><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
break;
case 2:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<2><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<2>(ed, f, df_dx, df_dy, df_dz);
break;
case 3:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<3><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<3>(ed, f, df_dx, df_dy, df_dz);
break;
#if 0
case 4:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<4>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<4>(ed, f, df_dx, df_dy, df_dz);
break;
case 5:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<5>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<5>(ed, f, df_dx, df_dy, df_dz);
break;
case 6:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<6>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<6>(ed, f, df_dx, df_dy, df_dz);
break;
#endif
default:
std::cout << "gpu_compute_field_derivatives: invalid order" << std::endl;
}
}
\ No newline at end of file
#include "kernels_gpu.h"
/* Return how many dblocks are needed to store this entity on GPU. */
size_t gpu_dblocks(const entity_data_cpu& ed)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t tot_elems = num_elems_all_orientations(ed);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (tot_elems % ks.cells_per_dblock)
num_dblocks += 1;
return num_dblocks;
}
/* Return the total number of dofs (actual + padding) that are contained
* in the dblocks making up this entity. */
size_t gpu_dblocks_dofs(const entity_data_cpu& ed)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t num_dblocks = gpu_dblocks(ed);
return num_dblocks * ks.dblock_size;
}
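As a worked example (hypothetical sizes, using the kernel_gpu_sizes<2> constants): an order-2 entity with 1000 elements needs ceil(1000/12) = 84 dblocks, so gpu_dblocks_dofs returns 84 * 128 = 10752 dofs, of which 1000 * 10 = 10000 are actual dofs and 752 are padding.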
void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
const vecxd& in_field, vecxd& out_field, bool to_gpu)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t tot_elems = num_elems_all_orientations(ed);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (tot_elems % ks.cells_per_dblock)
num_dblocks += 1;
for (size_t i = 0; i < num_dblocks; i++)
{
for (size_t j = 0; j < ks.cells_per_dblock; j++)
{
auto cur_elem = ks.cells_per_dblock*i + j;
if (cur_elem >= tot_elems)
break;
auto cpu_base = ed.dof_base + cur_elem * ed.num_bf;
auto gpu_base = edg.dof_base + ks.dblock_size*i + ks.num_bf*j;
if (to_gpu)
for (size_t k = 0; k < ks.num_bf; k++)
out_field(gpu_base + k) = in_field(cpu_base + k);
else
for (size_t k = 0; k < ks.num_bf; k++)
out_field(cpu_base + k) = in_field(gpu_base + k);
}
}
}
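Continuing the same hypothetical order-2 numbers: element 25 is cell j = 1 of dblock i = 2 (since 25 = 2*12 + 1), so reshape_dofs moves its k-th dof between CPU index ed.dof_base + 25*10 + k and GPU index edg.dof_base + 2*128 + 1*10 + k, in the direction selected by to_gpu.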
@@ -10,6 +10,8 @@
#include "kernels_cpu.h"
#include "timecounter.h"
#include "kernels_gpu.h"
using namespace sgr;
static void
@@ -58,22 +60,34 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
auto& e0 = mod.entity_at(0);
e0.sort_by_orientation();
vecxd Pf_e0 = e0.project(f);
vecxd df_dx_e0 = vecxd::Zero( Pf_e0.rows() );
vecxd df_dy_e0 = vecxd::Zero( Pf_e0.rows() );
vecxd df_dz_e0 = vecxd::Zero( Pf_e0.rows() );
vecxd Pf_cpu = e0.project(f);
/* Make CPU entity data */
entity_data_cpu ed;
e0.populate_entity_data(ed);
/* Derive GPU entity data */
entity_data_gpu edg(ed);
#if 0
timecounter tc;
/* Prepare I/O vectors and call kernel */
device_vector<double> Pf_gpu(Pf_cpu.data(), Pf_cpu.size());
device_vector<double> df_dx_gpu(Pf_cpu.size());
device_vector<double> df_dy_gpu(Pf_cpu.size());
device_vector<double> df_dz_gpu(Pf_cpu.size());
timecounter_gpu tc;
tc.tic();
compute_field_derivatives(ed, Pf_e0, df_dx_e0, df_dy_e0, df_dz_e0);
gpu_compute_field_derivatives(edg, Pf_gpu.data(), df_dx_gpu.data(),
df_dy_gpu.data(), df_dz_gpu.data());
double time = tc.toc();
vecxd df_dx_cpu = vecxd::Zero( Pf_cpu.size() );
vecxd df_dy_cpu = vecxd::Zero( Pf_cpu.size() );
vecxd df_dz_cpu = vecxd::Zero( Pf_cpu.size() );
df_dx_gpu.copyout( df_dx_cpu.data() );
df_dy_gpu.copyout( df_dy_cpu.data() );
df_dz_gpu.copyout( df_dz_cpu.data() );
if (geometric_order == 1)
{
std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
@@ -101,9 +115,9 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
auto& re = e0.cell_refelem(pe);
matxd mass = e0.mass_matrix(iT);
auto num_bf = re.num_basis_functions();
vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_e0.segment(iT*num_bf, num_bf);
vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_e0.segment(iT*num_bf, num_bf);
vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_e0.segment(iT*num_bf, num_bf);
vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_cpu.segment(iT*num_bf, num_bf);
vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_cpu.segment(iT*num_bf, num_bf);
vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_cpu.segment(iT*num_bf, num_bf);
err_x += diff_x.dot(mass*diff_x);
err_y += diff_y.dot(mass*diff_y);
@@ -115,14 +129,12 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
errs_x.push_back( std::sqrt(err_x) );
errs_y.push_back( std::sqrt(err_y) );
errs_z.push_back( std::sqrt(err_z) );
#endif
}
double rate_x = 0.0;
double rate_y = 0.0;
double rate_z = 0.0;
#if 0
std::cout << Byellowfg << "rate df/dx rate df/dy rate df/dz" << reset << std::endl;
for (size_t i = 1; i < sizes.size(); i++)
{
@@ -134,7 +146,6 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
COMPARE_VALUES_ABSOLUTE("df/dx", rate_x, double(approximation_order), 0.2);
COMPARE_VALUES_ABSOLUTE("df/dy", rate_y, double(approximation_order), 0.2);
COMPARE_VALUES_ABSOLUTE("df/dz", rate_z, double(approximation_order), 0.2);
#endif
return 0;
}
@@ -150,8 +161,8 @@ int main(void)
int failed_tests = 0;
std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
for (size_t go = 1; go < 5; go++)
for (size_t ao = go; ao < 5; ao++)
for (size_t go = 1; go < 2; go++)
for (size_t ao = go; ao < 4; ao++)
failed_tests += test_differentiation_convergence(go, ao);
return failed_tests;