Commit cbf819d4 authored by Matteo Cicuttin

Testing blocked differentiation kernel

parent ce97b241
@@ -116,6 +116,11 @@ if (OPT_ENABLE_GPU_SOLVER)
add_definitions(-D__HIP_PLATFORM_HCC__)
endif()
endif()
option(OPT_BLOCKED_GPU_KERNELS "Use blocked kernels on GPU" OFF)
if (OPT_BLOCKED_GPU_KERNELS)
add_definitions(-DUSE_BLOCKED_GPU_KERNELS)
endif()
endif()
if (OPT_ENABLE_GPU_SOLVER)
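
The new flag is an ordinary CMake option, so it can be toggled at configure time. A typical invocation enabling the blocked kernels might look like this (build-directory layout is illustrative):

    # configure with the GPU solver and the new blocked kernels enabled
    cmake -DOPT_ENABLE_GPU_SOLVER=ON -DOPT_BLOCKED_GPU_KERNELS=ON ..
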
@@ -10,41 +10,61 @@ struct kernel_gpu_sizes;
template<>
struct kernel_gpu_sizes<1>
{
-    static const size_t num_bf = num_dofs_3D(1);
-    static const size_t num_fluxes = num_dofs_2D(1);
+    static const size_t num_bf = num_dofs_3D(1); //4
+    static const size_t num_fluxes = num_dofs_2D(1); //3
static const size_t deriv_threads = 128;
static const size_t lifting_threads = 128;
static const size_t cells_per_dblock = 32;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
template<>
struct kernel_gpu_sizes<2>
{
-    static const size_t num_bf = num_dofs_3D(2);
-    static const size_t num_fluxes = num_dofs_2D(2);
+    static const size_t num_bf = num_dofs_3D(2); //10
+    static const size_t num_fluxes = num_dofs_2D(2); //6
static const size_t deriv_threads = 128;
static const size_t lifting_threads = 128;
static const size_t cells_per_dblock = 12;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
template<>
struct kernel_gpu_sizes<3>
{
-    static const size_t num_bf = num_dofs_3D(3);
-    static const size_t num_fluxes = num_dofs_2D(3);
+    static const size_t num_bf = num_dofs_3D(3); //20
+    static const size_t num_fluxes = num_dofs_2D(3); //10
static const size_t deriv_threads = 128;
static const size_t lifting_threads = 128;
static const size_t cells_per_dblock = 6;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
template<>
struct kernel_gpu_sizes<4>
{
-    static const size_t num_bf = num_dofs_3D(4);
+    static const size_t num_bf = num_dofs_3D(4); //35
static const size_t num_fluxes = num_dofs_2D(4);
static const size_t deriv_threads = 512;
static const size_t lifting_threads = 128;
static const size_t cells_per_dblock = 3;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
template<>
@@ -55,6 +75,11 @@ struct kernel_gpu_sizes<5>
static const size_t deriv_threads = 1024;
static const size_t lifting_threads = 128;
static const size_t cells_per_dblock = 32;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
template<>
@@ -65,19 +90,30 @@ struct kernel_gpu_sizes<6>
static const size_t deriv_threads = 1024;
static const size_t lifting_threads = 128;
static const size_t cells_per_dblock = 32;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
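
For the specializations wired into the runtime dispatch further down (orders 1 through 4), num_bf * cells_per_dblock fits inside one dblock_size slot: at order 2, for example, 12 cells x 10 basis functions = 120 dofs occupy a 128-entry dblock, leaving 8 padding entries. A minimal host-side sketch of the resulting storage arithmetic, mirroring what gpu_dblocks()/gpu_dblocks_dofs() compute (these helper names are illustrative, not part of the commit):

    /* Sketch only: padded storage implied by the blocked layout. */
    template<size_t K>
    constexpr size_t blocked_dblocks(size_t num_elems)
    {
        /* one dblock holds cells_per_dblock cells; round up */
        return (num_elems + kernel_gpu_sizes<K>::cells_per_dblock - 1)
                          / kernel_gpu_sizes<K>::cells_per_dblock;
    }

    template<size_t K>
    constexpr size_t blocked_dofs(size_t num_elems)
    {
        /* every dblock occupies dblock_size entries, padding included */
        return blocked_dblocks<K>(num_elems) * kernel_gpu_sizes<K>::dblock_size;
    }

    /* Example: order 2, 100 cells -> ceil(100/12) = 9 dblocks -> 9*128 = 1152
     * padded entries, vs. 100*10 = 1000 dofs in the natural layout. */
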
struct kernel_gpu_sizes_runtime
{
size_t num_bf;
size_t cells_per_dblock;
size_t dblock_size;
};
template<size_t N>
kernel_gpu_sizes_runtime get_kernel_gpu_sizes()
{
kernel_gpu_sizes_runtime ret;
using KS = kernel_gpu_sizes<N>;
-    ret.num_bf = kernel_gpu_sizes<N>::num_bf;
+    ret.num_bf = KS::num_bf;
ret.cells_per_dblock = KS::cells_per_dblock;
ret.dblock_size = KS::dblock_size;
return ret;
@@ -90,6 +126,8 @@ get_kernel_gpu_sizes(int approx_order)
{
case 1: return get_kernel_gpu_sizes<1>();
case 2: return get_kernel_gpu_sizes<2>();
case 3: return get_kernel_gpu_sizes<3>();
case 4: return get_kernel_gpu_sizes<4>();
}
throw std::invalid_argument("Can't retrieve info for this element");
@@ -14,7 +14,7 @@ gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J,
if (ofs_in_entity >= num_all_elems*KS::num_bf)
return;
-    int32_t cur_dof_offset = dof_base + ofs_in_entity;
+    int32_t output_dof_offset = dof_base + ofs_in_entity;
int32_t iT = ofs_in_entity / KS::num_bf;
int32_t elem_dof_base = dof_base + iT*KS::num_bf;
@@ -48,9 +48,68 @@ gpu_deriv_planar(gpuTextureObject_t F, const double * __restrict__ J,
accm_dF_dz += J[jac_ofs+8] * v;
}
-    dF_dx[cur_dof_offset] = accm_dF_dx;
-    dF_dy[cur_dof_offset] = accm_dF_dy;
-    dF_dz[cur_dof_offset] = accm_dF_dz;
+    dF_dx[output_dof_offset] = accm_dF_dx;
+    dF_dy[output_dof_offset] = accm_dF_dy;
+    dF_dz[output_dof_offset] = accm_dF_dz;
}
template<size_t K>
__global__ void
gpu_deriv_planar_blocked(gpuTextureObject_t F, const double * __restrict__ J,
gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
double * __restrict__ dF_dy, double * __restrict__ dF_dz,
int32_t num_all_elems, int32_t* orients, int32_t dof_base)
{
    using KS = kernel_gpu_sizes<K>;

    /* Threads past the used part of the dblock are padding: do nothing. */
    if (threadIdx.x >= KS::dofs_per_dblock)
        return;

    /* Each thread block spans blockDim.y dblocks of cells_per_dblock cells. */
    int32_t elem_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::cells_per_dblock;
    int32_t elem_ofs = threadIdx.x / KS::num_bf;
    int32_t iT = elem_base + elem_ofs;

    if (iT >= num_all_elems)
        return;

    /* Outputs are written in the blocked (padded) layout. */
    int32_t block_dof_base = (blockIdx.y * blockDim.y + threadIdx.y) * KS::dblock_size;
    int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x;
    int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf;

    int32_t iO = orients[iT];
    int32_t DM_row = threadIdx.x % KS::num_bf;
    int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
double accm_dF_dx = 0.0;
double accm_dF_dy = 0.0;
double accm_dF_dz = 0.0;
int32_t jac_ofs = 9*iT;
for (int32_t dof = 0; dof < KS::num_bf; dof++)
{
int32_t d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof;
int32_t f_ofs = elem_dof_base + dof;
double v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
accm_dF_dx += J[jac_ofs+0] * v;
accm_dF_dy += J[jac_ofs+3] * v;
accm_dF_dz += J[jac_ofs+6] * v;
d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + KS::num_bf;
v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
accm_dF_dx += J[jac_ofs+1] * v;
accm_dF_dy += J[jac_ofs+4] * v;
accm_dF_dz += J[jac_ofs+7] * v;
d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + 2*KS::num_bf;
v = fetch_tex(DM_tex, d_ofs) * fetch_tex(F, f_ofs);
accm_dF_dx += J[jac_ofs+2] * v;
accm_dF_dy += J[jac_ofs+5] * v;
accm_dF_dz += J[jac_ofs+8] * v;
}
dF_dx[output_dof_offset] = accm_dF_dx;
dF_dy[output_dof_offset] = accm_dF_dy;
dF_dz[output_dof_offset] = accm_dF_dz;
}
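
Within one dblock, threadIdx.x encodes both the cell and the basis function, and the tail threads of each 128-wide slot are idle. A small host-side sketch of that mapping for order 2 (num_bf = 10, cells_per_dblock = 12, dblock_size = 128), for illustration only:

    #include <cstdio>

    int main()
    {
        const int num_bf = 10, cells_per_dblock = 12, dblock_size = 128;
        const int dofs_per_dblock = num_bf * cells_per_dblock; /* 120 */
        for (int tx = 0; tx < dblock_size; tx++)
        {
            if (tx >= dofs_per_dblock)
                continue; /* padding thread: the kernel returns early */
            int elem_ofs = tx / num_bf;  /* cell within the dblock */
            int DM_row   = tx % num_bf;  /* basis function, i.e. matrix row */
            printf("thread %3d -> cell %2d, bf %2d\n", tx, elem_ofs, DM_row);
        }
        return 0;
    }
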
template<size_t K>
@@ -58,22 +117,37 @@ void
launch_deriv_kernel(entity_data_gpu& edg,
gpuTextureObject_t f, double *df_dx, double* df_dy, double* df_dz)
{
-    const auto THREADS_PER_BLOCK = kernel_gpu_sizes<K>::deriv_threads;
-    auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
-    if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
-        num_blocks += 1;
auto J = edg.jacobians.data();
auto Dtex = edg.differentiation_matrices.get_texture();
int32_t num_elems = edg.num_all_elems;
auto orients = edg.element_orientation.data();
auto num_orients = edg.element_orientation.size();
using KS = kernel_gpu_sizes<K>;
#ifdef USE_BLOCKED_GPU_KERNELS
    /* One grid block advances KS::parallel_dblocks dblocks of
     * KS::cells_per_dblock cells each; round up to cover all elements. */
    size_t num_blocks = edg.num_all_elems / (KS::cells_per_dblock * KS::parallel_dblocks);
    if (edg.num_all_elems % (KS::cells_per_dblock * KS::parallel_dblocks))
        num_blocks += 1;
dim3 grid_size(1, num_blocks);
dim3 block_size(KS::dblock_size, KS::parallel_dblocks);
if (edg.g_order == 1)
gpu_deriv_planar_blocked<K><<<grid_size, block_size>>>(f, J,
Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
#else
auto num_blocks = edg.num_bf*edg.num_all_elems/KS::deriv_threads;
if (edg.num_bf*edg.num_all_elems % KS::deriv_threads)
num_blocks += 1;
if (edg.g_order == 1)
-        gpu_deriv_planar<K><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+        gpu_deriv_planar<K><<<num_blocks, KS::deriv_threads>>>(f, J,
Dtex, df_dx, df_dy, df_dz, num_elems, orients, edg.dof_base);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
#endif
}
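
The two launch configurations above both amount to a ceiling division; one way to make that explicit (ceil_div is an illustrative helper, not part of the commit):

    /* Sketch: the same launch arithmetic with an explicit ceiling division. */
    static inline size_t ceil_div(size_t a, size_t b)
    {
        return (a + b - 1) / b;
    }

    /* Blocked configuration, for E = edg.num_all_elems:
     *   grid  = dim3(1, ceil_div(E, KS::cells_per_dblock * KS::parallel_dblocks))
     *   block = dim3(KS::dblock_size, KS::parallel_dblocks)
     * Flat configuration:
     *   blocks  = ceil_div(E * edg.num_bf, KS::deriv_threads)
     *   threads = KS::deriv_threads
     */
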
void
@@ -152,7 +226,7 @@ template<size_t K>
void
launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
{
-    const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
+    const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1;
#include "kernels_gpu.h"
#if 0
/* Return how many dblocks are needed to store this entity on GPU. */
size_t gpu_dblocks(const entity_data_cpu& ed)
{
@@ -37,7 +36,7 @@ void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
{
for (size_t j = 0; j < ks.cells_per_dblock; j++)
{
-                auto cur_elem = ks.cells_per_dblock*i + j;
+            auto cur_elem = ks.cells_per_dblock*i + j;
if (cur_elem >= tot_elems)
break;
@@ -53,4 +52,4 @@ void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
}
}
}
#endif
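
The reshape_dofs() calls in the test below permute dofs between the natural per-element layout and the padded dblock layout; its body is only partially shown above. A sketch of the gather/scatter it plausibly performs, consistent with the loop structure shown, with all parameter names assumed:

    #include <cstddef>

    /* Hypothetical sketch, not the committed implementation. */
    void reshape_dofs_sketch(size_t num_elems, size_t num_bf,
                             size_t cells_per_dblock, size_t dblock_size,
                             size_t nat_base, size_t blk_base,
                             const double* in, double* out, bool forward)
    {
        size_t num_dblocks = (num_elems + cells_per_dblock - 1) / cells_per_dblock;
        for (size_t i = 0; i < num_dblocks; i++)
        {
            for (size_t j = 0; j < cells_per_dblock; j++)
            {
                size_t cur_elem = cells_per_dblock*i + j;
                if (cur_elem >= num_elems)
                    break;
                for (size_t k = 0; k < num_bf; k++)
                {
                    size_t nat = nat_base + cur_elem*num_bf + k;
                    size_t blk = blk_base + i*dblock_size + j*num_bf + k;
                    if (forward) out[blk] = in[nat];  /* natural -> blocked */
                    else         out[nat] = in[blk];  /* blocked -> natural */
                }
            }
        }
    }
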
@@ -89,12 +89,15 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
vecxd Pdf_dx = vecxd::Zero(model_num_dofs);
vecxd Pdf_dy = vecxd::Zero(model_num_dofs);
vecxd Pdf_dz = vecxd::Zero(model_num_dofs);
-    vecxd Cdf_dx = vecxd::Zero(model_num_dofs);
-    vecxd Cdf_dy = vecxd::Zero(model_num_dofs);
-    vecxd Cdf_dz = vecxd::Zero(model_num_dofs);
std::vector<entity_data_gpu> edgs;
#ifdef USE_BLOCKED_GPU_KERNELS
std::vector<entity_data_cpu> eds;
size_t model_num_dofs_gpu = 0;
#else
size_t model_num_dofs_gpu = model_num_dofs;
#endif
for (auto& e : mod)
{
e.project(f, Pf);
@@ -104,14 +107,27 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
entity_data_cpu ed;
e.populate_entity_data(ed);
entity_data_gpu edg(ed);
#ifdef USE_BLOCKED_GPU_KERNELS
edg.dof_base = model_num_dofs_gpu;
model_num_dofs_gpu += gpu_dblocks_dofs(ed);
eds.push_back( std::move(ed) );
#endif
edgs.push_back( std::move(edg) );
}
/* Prepare I/O vectors and call kernel */
#ifdef USE_BLOCKED_GPU_KERNELS
assert(eds.size() == edgs.size());
vecxd Pf_reshaped = vecxd::Zero(model_num_dofs_gpu);
for (size_t i = 0; i < eds.size(); i++)
reshape_dofs(eds[i], edgs[i], Pf, Pf_reshaped, true);
texture_allocator<double> Pf_gpu(Pf_reshaped.data(), Pf_reshaped.size());
#else
texture_allocator<double> Pf_gpu(Pf.data(), Pf.size());
-    device_vector<double> df_dx_gpu(model_num_dofs);
-    device_vector<double> df_dy_gpu(model_num_dofs);
-    device_vector<double> df_dz_gpu(model_num_dofs);
#endif
device_vector<double> df_dx_gpu(model_num_dofs_gpu);
device_vector<double> df_dy_gpu(model_num_dofs_gpu);
device_vector<double> df_dz_gpu(model_num_dofs_gpu);
for (auto& edg : edgs)
{
@@ -136,9 +152,37 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
}
}
#ifdef USE_BLOCKED_GPU_KERNELS
vecxd Cdf_dx_exp = vecxd::Zero(model_num_dofs_gpu);
vecxd Cdf_dy_exp = vecxd::Zero(model_num_dofs_gpu);
vecxd Cdf_dz_exp = vecxd::Zero(model_num_dofs_gpu);
df_dx_gpu.copyout( Cdf_dx_exp.data() );
df_dy_gpu.copyout( Cdf_dy_exp.data() );
df_dz_gpu.copyout( Cdf_dz_exp.data() );
vecxd Cdf_dx = vecxd::Zero(model_num_dofs);
vecxd Cdf_dy = vecxd::Zero(model_num_dofs);
vecxd Cdf_dz = vecxd::Zero(model_num_dofs);
assert(eds.size() == edgs.size());
Pf = vecxd::Zero(model_num_dofs);
for (size_t i = 0; i < eds.size(); i++)
{
reshape_dofs(eds[i], edgs[i], Cdf_dx_exp, Cdf_dx, false);
reshape_dofs(eds[i], edgs[i], Cdf_dy_exp, Cdf_dy, false);
reshape_dofs(eds[i], edgs[i], Cdf_dz_exp, Cdf_dz, false);
reshape_dofs(eds[i], edgs[i], Pf_reshaped, Pf, false);
}
#else
vecxd Cdf_dx = vecxd::Zero(model_num_dofs_gpu);
vecxd Cdf_dy = vecxd::Zero(model_num_dofs_gpu);
vecxd Cdf_dz = vecxd::Zero(model_num_dofs_gpu);
df_dx_gpu.copyout( Cdf_dx.data() );
df_dy_gpu.copyout( Cdf_dy.data() );
df_dz_gpu.copyout( Cdf_dz.data() );
#endif
double err_x = 0.0;
double err_y = 0.0;
@@ -173,6 +217,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
auto gi = e.cell_global_index_by_gmsh(iT);
assert(gi < model_num_cells);
vecxd phi_bar = re.basis_functions({1./3., 1./3., 1./3.});
var_Pf[gi] = Pf.segment(ofs, num_bf).dot(phi_bar);
var_df_dx[gi] = Cdf_dx.segment(ofs, num_bf).dot(phi_bar);
var_df_dy[gi] = Cdf_dy.segment(ofs, num_bf).dot(phi_bar);
var_df_dz[gi] = Cdf_dz.segment(ofs, num_bf).dot(phi_bar);
@@ -183,6 +228,7 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
}
#ifdef WRITE_TEST_OUTPUTS
silodb.write_zonal_variable("f", var_Pf);
silodb.write_zonal_variable("df_dx", var_df_dx);
silodb.write_zonal_variable("df_dy", var_df_dy);
silodb.write_zonal_variable("df_dz", var_df_dz);
@@ -224,7 +270,7 @@ int main(void)
std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
for (size_t go = 1; go < 2; go++)
-    for (size_t ao = go; ao < 7; ao++)
+    for (size_t ao = go; ao < 5; ao++)
failed_tests += test_differentiation_convergence(go, ao);
return failed_tests;
@@ -252,7 +252,7 @@ int main(void)
int failed_tests = 0;
std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl;
-    for (size_t ao = 1; ao < 4; ao++)
+    for (size_t ao = 1; ao < 7; ao++)
test_lifting(1, ao);
gmsh::finalize();