Skip to content
Snippets Groups Projects
Commit d7deccc9 authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

Lifting perf tests

parent cbf819d4
Branches
Tags
No related merge requests found
......@@ -19,7 +19,7 @@ struct kernel_gpu_sizes<1>
static const size_t cells_per_dblock = 32;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
static const size_t parallel_dblocks = 1;
};
template<>
......@@ -34,7 +34,7 @@ struct kernel_gpu_sizes<2>
static const size_t cells_per_dblock = 12;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
static const size_t parallel_dblocks = 1;
};
template<>
......@@ -49,7 +49,7 @@ struct kernel_gpu_sizes<3>
static const size_t cells_per_dblock = 6;
static const size_t dofs_per_dblock = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
static const size_t parallel_dblocks = 1;
};
template<>
......@@ -147,4 +147,4 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
gpuTextureObject_t F, double *dF_dx, double* dF_dy, double* dF_dz);
void
gpu_compute_flux_lifting(entity_data_gpu& edg, gpuTextureObject_t f, double *out);
\ No newline at end of file
gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out);
\ No newline at end of file
......@@ -58,7 +58,7 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
auto src_row = iO*ed.num_bf+i;
auto src_col = j;
auto dst_row = i;
auto dst_col = iO*4*ed.num_fluxes + ( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) );
auto dst_col = iO*4*ed.num_fluxes + j;//( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) );
lm(dst_row, dst_col) = ed.lifting_matrices(src_row, src_col);
}
}
......
......@@ -189,7 +189,7 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
template<size_t K>
__global__ void
gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
const double * __restrict__ dets, double * __restrict__ lifted_flux,
int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base)
{
......@@ -215,8 +215,8 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
{
int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
int32_t f_ofs = elem_flux_base + (dof+delta)%(4*KS::num_fluxes);
acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs);
int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes);
acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs);
}
lifted_flux[cur_dof_offset] += acc;
......@@ -224,9 +224,9 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
template<size_t K>
void
launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
launch_lift_kernel(entity_data_gpu& edg, const double *f, double *out)
{
const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads;
const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1;
......@@ -245,7 +245,7 @@ launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
}
void
gpu_compute_flux_lifting(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out)
{
switch (edg.a_order)
{
......
......@@ -137,14 +137,14 @@ int test_lifting(int geometric_order, int approximation_order)
edgs.push_back( std::move(edg) );
}
texture_allocator<double> PFdotn_gpu(PFdotn.data(), PFdotn.size());
device_vector<double> PFdotn_gpu(PFdotn.data(), PFdotn.size());
device_vector<double> LiftF_gpu(LiftF.data(), LiftF.size());
for (auto& edg : edgs)
{
timecounter_gpu tc;
tc.tic();
gpu_compute_flux_lifting(edg, PFdotn_gpu.get_texture(), LiftF_gpu.data());
gpu_compute_flux_lifting(edg, PFdotn_gpu.data(), LiftF_gpu.data());
double time = tc.toc();
auto num_cells = edg.num_all_elems;
......@@ -153,6 +153,12 @@ int test_lifting(int geometric_order, int approximation_order)
std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
double flops = 3*(edg.num_bf)*4*edg.num_fluxes*num_cells;
std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
auto read_gbs = 8*4*edg.num_fluxes*num_cells/(1e9*time);
auto write_gbs = 8*edg.num_bf*num_cells/(1e9*time);
auto tot_gbs = read_gbs + write_gbs;
std::cout << "Read: " << read_gbs << " GB/s, write: " << write_gbs;
std::cout << " GB/s, total: " << tot_gbs << " GB/s" << std::endl;
}
else
{
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment