Skip to content
Snippets Groups Projects
Commit ce97b241 authored by Matteo Cicuttin
Browse files

Testing perf optimization in lifting.

parent 81b5ae3b
No related branches found
No related tags found
No related merge requests found
...@@ -45,11 +45,33 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed) ...@@ -45,11 +45,33 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
auto LMrows = ed.num_bf; auto LMrows = ed.num_bf;
auto LMcols = 4*ed.num_fluxes*ed.num_orientations; auto LMcols = 4*ed.num_fluxes*ed.num_orientations;
MatxCM<double> lm = MatxCM<double>::Zero(LMrows, LMcols); MatxCM<double> lm = MatxCM<double>::Zero(LMrows, LMcols);
for (size_t i = 0; i < ed.num_orientations; i++) for (size_t iO = 0; iO < ed.num_orientations; iO++)
{ {
/*
lm.block(0, 4*i*ed.num_fluxes, ed.num_bf, 4*ed.num_fluxes) = lm.block(0, 4*i*ed.num_fluxes, ed.num_bf, 4*ed.num_fluxes) =
ed.lifting_matrices.block(i*ed.num_bf, 0, ed.num_bf, 4*ed.num_fluxes); ed.lifting_matrices.block(i*ed.num_bf, 0, ed.num_bf, 4*ed.num_fluxes);
*/
for (size_t i = 0; i < ed.num_bf; i++)
{
for (size_t j = 0; j < 4*ed.num_fluxes; j++)
{
auto src_row = iO*ed.num_bf+i;
auto src_col = j;
auto dst_row = i;
auto dst_col = iO*4*ed.num_fluxes + ( ((4*ed.num_fluxes - i) + j)%(4*ed.num_fluxes) );
lm(dst_row, dst_col) = ed.lifting_matrices(src_row, src_col);
}
}
} }
/*
matxd orig = ed.lifting_matrices.block(0, 0, ed.num_bf, 4*ed.num_fluxes);
matxd rotated = lm.block(0, 0, ed.num_bf, 4*ed.num_fluxes);
std::cout << "*** ORIG ***" << std::endl;
std::cout << orig << std::endl;
std::cout << "*** ROTATED ***" << std::endl;
std::cout << rotated << std::endl;
*/
lifting_matrices.init(lm.data(), lm.size()); lifting_matrices.init(lm.data(), lm.size());
jacobians.copyin(ed.jacobians.data(), ed.jacobians.size()); jacobians.copyin(ed.jacobians.data(), ed.jacobians.size());
......
...@@ -135,11 +135,13 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex, ...@@ -135,11 +135,13 @@ gpu_lift_planar(gpuTextureObject_t flux, gpuTextureObject_t LM_tex,
int32_t LM_orient = 4*KS::num_bf*KS::num_fluxes*iO; int32_t LM_orient = 4*KS::num_bf*KS::num_fluxes*iO;
double inv_det = 1./dets[iT]; double inv_det = 1./dets[iT];
int32_t delta = ofs_in_entity % KS::num_bf;
double acc = 0.0; double acc = 0.0;
for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++) for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
{ {
int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof; int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
int32_t f_ofs = elem_flux_base + dof; int32_t f_ofs = elem_flux_base + (dof+delta)%(4*KS::num_fluxes);
acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs); acc += inv_det * fetch_tex(LM_tex, l_ofs) * fetch_tex(flux, f_ofs);
} }
...@@ -150,7 +152,7 @@ template<size_t K> ...@@ -150,7 +152,7 @@ template<size_t K>
void void
launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out) launch_lift_kernel(entity_data_gpu& edg, gpuTextureObject_t f, double *out)
{ {
const auto THREADS_PER_BLOCK = 256;//kernel_gpu_sizes<K>::deriv_threads; const auto THREADS_PER_BLOCK = 128;//kernel_gpu_sizes<K>::deriv_threads;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK; auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK) if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1; num_blocks += 1;
......
...@@ -252,7 +252,7 @@ int main(void) ...@@ -252,7 +252,7 @@ int main(void)
int failed_tests = 0; int failed_tests = 0;
std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl; std::cout << Bmagentafg << " *** TESTING: LIFTING ***" << reset << std::endl;
for (size_t ao = 1; ao < 2; ao++) for (size_t ao = 1; ao < 4; ao++)
test_lifting(1, ao); test_lifting(1, ao);
gmsh::finalize(); gmsh::finalize();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment