Skip to content
Snippets Groups Projects
Commit ce8f502b authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

Templated differentiation kernel.

parent 1ab9f794
No related branches found
No related tags found
No related merge requests found
......@@ -258,6 +258,7 @@ entity::populate_entity_data(entity_data_cpu<1>& ed) const
{
assert(g_order == 1);
assert(reference_cells.size() > 0);
ed.a_order = a_order;
ed.num_elems = num_elements_per_orientation();
ed.num_orientations = num_orientations();
ed.num_bf = reference_cells[0].num_basis_functions();
......
......@@ -8,6 +8,7 @@ struct entity_data_cpu;
template<>
struct entity_data_cpu<1>
{
int a_order;
std::vector<size_t> num_elems; /* No. of elements in the entity */
size_t num_orientations; /* No. of bf. orientations */
size_t num_bf; /* No. of dofs per elem */
......
#include "types.h"
#include "entity_data.h"
#include "kernels_cpu.h"
/* Field derivatives kernel, case for elements with planar faces */
void compute_field_derivatives(const entity_data_cpu<1>& ed,
const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz)
{
/* The elements must be ordered by orientation -> entity::sort_by_orientation() */
size_t orient_elem_base = 0;
for (size_t iO = 0; iO < ed.num_orientations; iO++)
switch (ed.a_order)
{
size_t num_elems = ed.num_elems[iO];
size_t orient_base = orient_elem_base * ed.num_bf;
case 1:
compute_field_derivatives_kernel<1>(ed, f, df_dx, df_dy, df_dz);
break;
for (size_t iT = 0; iT < num_elems; iT++)
{
size_t ent_iT = orient_elem_base + iT;
size_t dofofs = ed.dofs_base + orient_base + iT*ed.num_bf;
case 2:
compute_field_derivatives_kernel<2>(ed, f, df_dx, df_dy, df_dz);
break;
for (size_t odof = 0; odof < ed.num_bf; odof++)
{
double acc_df_dx = 0.0;
double acc_df_dy = 0.0;
double acc_df_dz = 0.0;
case 3:
compute_field_derivatives_kernel<3>(ed, f, df_dx, df_dy, df_dz);
break;
for (size_t refsys_dir = 0; refsys_dir < 3; refsys_dir++)
{
for (size_t idof = 0; idof < ed.num_bf; idof++)
{
auto dm_row = iO*ed.num_bf + odof;
auto dm_col = refsys_dir*ed.num_bf + idof;
double D = ed.differentiation_matrix(dm_row, dm_col);
acc_df_dx += ed.jacobians(3*ent_iT + 0, refsys_dir) * D * f(dofofs+idof);
acc_df_dy += ed.jacobians(3*ent_iT + 1, refsys_dir) * D * f(dofofs+idof);
acc_df_dz += ed.jacobians(3*ent_iT + 2, refsys_dir) * D * f(dofofs+idof);
}
}
case 4:
compute_field_derivatives_kernel<4>(ed, f, df_dx, df_dy, df_dz);
break;
case 5:
compute_field_derivatives_kernel<5>(ed, f, df_dx, df_dy, df_dz);
break;
df_dx(dofofs+odof) = acc_df_dx;
df_dy(dofofs+odof) = acc_df_dy;
df_dz(dofofs+odof) = acc_df_dz;
}
}
orient_elem_base += num_elems;
default:
std::cout << "compute_field_derivatives: invalid order" << std::endl;
}
}
\ No newline at end of file
#pragma once
void compute_field_derivatives(const entity_data_cpu<1>& ed,
const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz);
template<size_t AK>
struct num_dofs_3D
{
static const size_t value = ((AK+3)*(AK+2)*(AK+1))/6;
};
template<size_t AK>
struct num_dofs_2D
{
static const size_t value = ((AK+2)*(AK+1))/2;
};
template<size_t AK>
struct kernel_cpu_sizes
{
static const size_t num_bf = num_dofs_3D<AK>::value;
static const size_t num_fluxes = num_dofs_2D<AK>::value;
static const size_t num_faces = 4;
static const size_t num_all_fluxes = num_fluxes * num_faces;
};
template<size_t AK>
void compute_field_derivatives_kernel(const entity_data_cpu<1>& ed,
const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz)
{
/* Flop count: 3*ed.num_bf*3*ed.num_bf*num_total_elems */
/* The elements must be ordered by orientation -> entity::sort_by_orientation() */
using KS = kernel_cpu_sizes<AK>;
assert(ed.num_bf == KS::num_bf);
size_t orient_elem_base = 0;
for (size_t iO = 0; iO < ed.num_orientations; iO++)
{
size_t num_elems = ed.num_elems[iO];
size_t orient_base = orient_elem_base * ed.num_bf;
for (size_t iT = 0; iT < num_elems; iT++)
{
size_t ent_iT = orient_elem_base + iT;
size_t dofofs = ed.dofs_base + orient_base + iT*ed.num_bf;
for (size_t odof = 0; odof < KS::num_bf; odof++)
{
double acc_df_dx = 0.0;
double acc_df_dy = 0.0;
double acc_df_dz = 0.0;
for (size_t refsys_dir = 0; refsys_dir < 3; refsys_dir++)
{
for (size_t idof = 0; idof < KS::num_bf; idof++)
{
auto dm_row = iO*ed.num_bf + odof;
auto dm_col = refsys_dir*ed.num_bf + idof;
double D = ed.differentiation_matrix(dm_row, dm_col);
acc_df_dx += ed.jacobians(3*ent_iT + 0, refsys_dir) * D * f(dofofs+idof);
acc_df_dy += ed.jacobians(3*ent_iT + 1, refsys_dir) * D * f(dofofs+idof);
acc_df_dz += ed.jacobians(3*ent_iT + 2, refsys_dir) * D * f(dofofs+idof);
}
}
df_dx(dofofs+odof) = acc_df_dx;
df_dy(dofofs+odof) = acc_df_dy;
df_dz(dofofs+odof) = acc_df_dz;
}
}
orient_elem_base += num_elems;
}
}
void compute_field_derivatives(const entity_data_cpu<1>& ed,
const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz);
\ No newline at end of file
......@@ -8,6 +8,7 @@
#include "sgr.hpp"
#include "entity_data.h"
#include "kernels_cpu.h"
#include "timecounter.h"
using namespace sgr;
......@@ -65,7 +66,13 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
entity_data_cpu<1> ed;
e0.populate_entity_data(ed);
timecounter tc;
tc.tic();
compute_field_derivatives(ed, Pf_e0, df_dx_e0, df_dy_e0, df_dz_e0);
double time = tc.toc();
std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
size_t flops = 3*ed.num_bf*3*ed.num_bf*e0.num_elements();
std::cout << flops/(1e9*time) << " GFlops/s" << std::endl;
vecxd Pdf_dx_e0 = e0.project(df_dx);
vecxd Pdf_dy_e0 = e0.project(df_dy);
......
#pragma once
#include <chrono>
#ifdef HAVE_CUDA_RT
#include <cuda_runtime.h>
#endif
class timecounter
{
std::chrono::time_point<std::chrono::steady_clock> begin, end;
public:
timecounter()
{}
void tic()
{
begin = std::chrono::steady_clock::now();
}
double toc()
{
end = std::chrono::steady_clock::now();
std::chrono::duration<double> diff = end-begin;
return diff.count();
}
};
#ifdef HAVE_CUDA_RT
class timecounter_cuda
{
cudaEvent_t start;
cudaEvent_t stop;
public:
timecounter_cuda()
{
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
~timecounter_cuda()
{
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
void tic()
{
cudaEventRecord(start, 0);
}
double toc()
{
cudaEventRecord(stop, 0);
float elapsed;
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsed, start, stop);
return elapsed/1000.0;
}
};
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment