diff --git a/entity.cpp b/entity.cpp index e8cad719a1ae7b246350f72e90a8c764957a6178..a06c7430102f1695991f2ccaea72e37df79e84b2 100644 --- a/entity.cpp +++ b/entity.cpp @@ -258,6 +258,7 @@ entity::populate_entity_data(entity_data_cpu<1>& ed) const { assert(g_order == 1); assert(reference_cells.size() > 0); + ed.a_order = a_order; ed.num_elems = num_elements_per_orientation(); ed.num_orientations = num_orientations(); ed.num_bf = reference_cells[0].num_basis_functions(); diff --git a/entity_data.h b/entity_data.h index 130ce394efb3290982793e9f9191e3883f725aa0..0352a1a561fd642422c589a75f3469af0a302142 100644 --- a/entity_data.h +++ b/entity_data.h @@ -8,6 +8,7 @@ struct entity_data_cpu; template<> struct entity_data_cpu<1> { + int a_order; std::vector<size_t> num_elems; /* No. of elements in the entity */ size_t num_orientations; /* No. of bf. orientations */ size_t num_bf; /* No. of dofs per elem */ diff --git a/kernels_cpu.cpp b/kernels_cpu.cpp index 1fe17ea9a4bc1c89c5124da5e8e80e88f0b5f78f..e8bec0f861f27916c9e682b968a0e8b648e45cfe 100644 --- a/kernels_cpu.cpp +++ b/kernels_cpu.cpp @@ -1,47 +1,34 @@ #include "types.h" #include "entity_data.h" +#include "kernels_cpu.h" /* Field derivatives kernel, case for elements with planar faces */ void compute_field_derivatives(const entity_data_cpu<1>& ed, const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz) { - /* The elements must be ordered by orientation -> entity::sort_by_orientation() */ - size_t orient_elem_base = 0; - for (size_t iO = 0; iO < ed.num_orientations; iO++) + switch (ed.a_order) { - size_t num_elems = ed.num_elems[iO]; - size_t orient_base = orient_elem_base * ed.num_bf; + case 1: + compute_field_derivatives_kernel<1>(ed, f, df_dx, df_dy, df_dz); + break; - for (size_t iT = 0; iT < num_elems; iT++) - { - size_t ent_iT = orient_elem_base + iT; - size_t dofofs = ed.dofs_base + orient_base + iT*ed.num_bf; + case 2: + compute_field_derivatives_kernel<2>(ed, f, df_dx, df_dy, df_dz); + break; - for (size_t odof = 0; odof < ed.num_bf; odof++) - { - double acc_df_dx = 0.0; - double acc_df_dy = 0.0; - double acc_df_dz = 0.0; + case 3: + compute_field_derivatives_kernel<3>(ed, f, df_dx, df_dy, df_dz); + break; - for (size_t refsys_dir = 0; refsys_dir < 3; refsys_dir++) - { - for (size_t idof = 0; idof < ed.num_bf; idof++) - { - auto dm_row = iO*ed.num_bf + odof; - auto dm_col = refsys_dir*ed.num_bf + idof; - double D = ed.differentiation_matrix(dm_row, dm_col); - acc_df_dx += ed.jacobians(3*ent_iT + 0, refsys_dir) * D * f(dofofs+idof); - acc_df_dy += ed.jacobians(3*ent_iT + 1, refsys_dir) * D * f(dofofs+idof); - acc_df_dz += ed.jacobians(3*ent_iT + 2, refsys_dir) * D * f(dofofs+idof); - } - } + case 4: + compute_field_derivatives_kernel<4>(ed, f, df_dx, df_dy, df_dz); + break; + + case 5: + compute_field_derivatives_kernel<5>(ed, f, df_dx, df_dy, df_dz); + break; - df_dx(dofofs+odof) = acc_df_dx; - df_dy(dofofs+odof) = acc_df_dy; - df_dz(dofofs+odof) = acc_df_dz; - } - } - - orient_elem_base += num_elems; + default: + std::cout << "compute_field_derivatives: invalid order" << std::endl; } } \ No newline at end of file diff --git a/kernels_cpu.h b/kernels_cpu.h index 08b5abd085b5ec60a12e318892543f1581902d9e..339bbe9b3c41d73d658a02c00312f3741d84a12d 100644 --- a/kernels_cpu.h +++ b/kernels_cpu.h @@ -1,5 +1,76 @@ #pragma once -void compute_field_derivatives(const entity_data_cpu<1>& ed, - const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz); +template<size_t AK> +struct num_dofs_3D +{ + static const size_t value = ((AK+3)*(AK+2)*(AK+1))/6; +}; + +template<size_t AK> +struct num_dofs_2D +{ + static const size_t value = ((AK+2)*(AK+1))/2; +}; + +template<size_t AK> +struct kernel_cpu_sizes +{ + static const size_t num_bf = num_dofs_3D<AK>::value; + static const size_t num_fluxes = num_dofs_2D<AK>::value; + static const size_t num_faces = 4; + static const size_t num_all_fluxes = num_fluxes * num_faces; +}; + + +template<size_t AK> +void compute_field_derivatives_kernel(const entity_data_cpu<1>& ed, + const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz) +{ + /* Flop count: 3*ed.num_bf*3*ed.num_bf*num_total_elems */ + /* The elements must be ordered by orientation -> entity::sort_by_orientation() */ + + using KS = kernel_cpu_sizes<AK>; + assert(ed.num_bf == KS::num_bf); + + size_t orient_elem_base = 0; + for (size_t iO = 0; iO < ed.num_orientations; iO++) + { + size_t num_elems = ed.num_elems[iO]; + size_t orient_base = orient_elem_base * ed.num_bf; + for (size_t iT = 0; iT < num_elems; iT++) + { + size_t ent_iT = orient_elem_base + iT; + size_t dofofs = ed.dofs_base + orient_base + iT*ed.num_bf; + + for (size_t odof = 0; odof < KS::num_bf; odof++) + { + double acc_df_dx = 0.0; + double acc_df_dy = 0.0; + double acc_df_dz = 0.0; + + for (size_t refsys_dir = 0; refsys_dir < 3; refsys_dir++) + { + for (size_t idof = 0; idof < KS::num_bf; idof++) + { + auto dm_row = iO*ed.num_bf + odof; + auto dm_col = refsys_dir*ed.num_bf + idof; + double D = ed.differentiation_matrix(dm_row, dm_col); + acc_df_dx += ed.jacobians(3*ent_iT + 0, refsys_dir) * D * f(dofofs+idof); + acc_df_dy += ed.jacobians(3*ent_iT + 1, refsys_dir) * D * f(dofofs+idof); + acc_df_dz += ed.jacobians(3*ent_iT + 2, refsys_dir) * D * f(dofofs+idof); + } + } + + df_dx(dofofs+odof) = acc_df_dx; + df_dy(dofofs+odof) = acc_df_dy; + df_dz(dofofs+odof) = acc_df_dz; + } + } + + orient_elem_base += num_elems; + } +} + +void compute_field_derivatives(const entity_data_cpu<1>& ed, + const vecxd& f, vecxd& df_dx, vecxd& df_dy, vecxd& df_dz); \ No newline at end of file diff --git a/test_differentiation.cpp b/test_differentiation.cpp index 74331e3d00781562322a066358311bc8695838b1..987345998d283c7f62c2e45bf84d80a5c706afb4 100644 --- a/test_differentiation.cpp +++ b/test_differentiation.cpp @@ -8,6 +8,7 @@ #include "sgr.hpp" #include "entity_data.h" #include "kernels_cpu.h" +#include "timecounter.h" using namespace sgr; @@ -65,7 +66,13 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde entity_data_cpu<1> ed; e0.populate_entity_data(ed); + timecounter tc; + tc.tic(); compute_field_derivatives(ed, Pf_e0, df_dx_e0, df_dy_e0, df_dz_e0); + double time = tc.toc(); + std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: "; + size_t flops = 3*ed.num_bf*3*ed.num_bf*e0.num_elements(); + std::cout << flops/(1e9*time) << " GFlops/s" << std::endl; vecxd Pdf_dx_e0 = e0.project(df_dx); vecxd Pdf_dy_e0 = e0.project(df_dy); diff --git a/timecounter.h b/timecounter.h new file mode 100644 index 0000000000000000000000000000000000000000..1654558be55ba78ac79810ee5424d68d03d898ff --- /dev/null +++ b/timecounter.h @@ -0,0 +1,64 @@ +#pragma once + +#include <chrono> + +#ifdef HAVE_CUDA_RT +#include <cuda_runtime.h> +#endif + +class timecounter +{ + std::chrono::time_point<std::chrono::steady_clock> begin, end; + +public: + timecounter() + {} + + void tic() + { + begin = std::chrono::steady_clock::now(); + } + + double toc() + { + end = std::chrono::steady_clock::now(); + std::chrono::duration<double> diff = end-begin; + return diff.count(); + } +}; + +#ifdef HAVE_CUDA_RT +class timecounter_cuda +{ + cudaEvent_t start; + cudaEvent_t stop; + +public: + timecounter_cuda() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + } + + ~timecounter_cuda() + { + cudaEventDestroy(start); + cudaEventDestroy(stop); + } + + void tic() + { + cudaEventRecord(start, 0); + } + + double toc() + { + cudaEventRecord(stop, 0); + + float elapsed; + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed, start, stop); + return elapsed/1000.0; + } +}; +#endif