Commit 723ff703 authored by Matteo Cicuttin

Started rewriting CUDA kernels.

parent c9a1ce9e
@@ -66,7 +66,6 @@ link_directories(${GMSH_LIB})
######################################################################
## Use or not use GPU solvers
option(OPT_ENABLE_GPU_SOLVER "Enable GPU solvers" OFF)
if (OPT_ENABLE_GPU_SOLVER)
add_definitions(-DENABLE_GPU_SOLVER)
@@ -117,9 +116,14 @@ if (OPT_ENABLE_GPU_SOLVER)
add_definitions(-D__HIP_PLATFORM_HCC__)
endif()
endif()
endif()
if (OPT_ENABLE_GPU_SOLVER)
option(OPT_DEBUG_GPU "Enable GPU debugging" OFF)
if (OPT_DEBUG_GPU)
add_definitions(-DGPU_DEBUG)
endif()
endif()
######################################################################
## Compiler identification stuff
@@ -235,12 +239,15 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
set(COMMON_SOURCES gmsh_io.cpp reference_element.cpp physical_element.cpp entity.cpp kernels_cpu.cpp connectivity.cpp entity_data.cpp)
set(LIBGMSHDG_SOURCES src/gmsh_io.cpp
src/entity.cpp
src/reference_element.cpp
src/physical_element.cpp
src/entity.cpp
src/kernels_cpu.cpp
src/connectivity.cpp
src/entity_data.cpp)
src/entity_data.cpp
src/kernels_cpu.cpp
src/kernels_gpu_mi.cpp
src/kernels_cuda.cu
)
add_library(gmshdg SHARED ${LIBGMSHDG_SOURCES})
target_link_libraries(gmshdg ${LINK_LIBS})
@@ -14,3 +14,4 @@ using MatxCM = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>
template<typename T>
using MatxRM = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
#pragma once
#include <numeric>
#include "eigen.h"
#include "types.h"
@@ -57,13 +59,20 @@ struct entity_data_cpu
vecxd face_determinants;
};
inline size_t
num_elems_all_orientations(const entity_data_cpu& ed)
{
assert(ed.num_elems.size() == ed.num_orientations);
return std::accumulate(ed.num_elems.begin(), ed.num_elems.end(), size_t(0));
}
#ifdef ENABLE_GPU_SOLVER
struct entity_data_gpu
{
int g_order;
int a_order;
size_t* num_elems;
size_t num_orientations;
size_t num_all_elems;
size_t * element_orientation;
size_t num_bf;
size_t dof_base;
@@ -79,13 +88,12 @@ struct entity_data_gpu
~entity_data_gpu();
};
template<typename T>
void
gpu_copyin(const std::vector<T> src, T** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
template<typename T>
@@ -93,7 +101,7 @@ void
gpu_copyin(const MatxCM<T>& src, T** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
template<typename T>
@@ -101,7 +109,7 @@ void
gpu_copyin(const MatxRM<T>& src, T** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
template<typename T>
@@ -109,7 +117,9 @@ void
gpu_copyin(const Eigen::Matrix<T, Eigen::Dynamic, 1>& src, double** dst)
{
checkGPU( gpuMalloc( (void**)dst, src.size()*sizeof(T) ) );
checkGPU( gpuMemcpy( dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
checkGPU( gpuMemcpy( *dst, src.data(), src.size()*sizeof(T), gpuMemcpyHostToDevice) );
}
#endif /* ENABLE_GPU_SOLVER */
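The fix above dereferences dst in the copy: gpuMalloc writes the device address into *dst, so that address, not the host-side pointer-to-pointer, must be the memcpy destination. A minimal usage sketch, assuming the gpuMalloc/gpuMemcpy/gpuFree wrappers mirror their CUDA counterparts and that this header is included:

#include <vector>

void upload_example()   // hypothetical helper, not part of the commit
{
    std::vector<double> host(1024, 1.0);   // host-side data
    double* dev = nullptr;                 // receives the device address
    gpu_copyin(host, &dev);                // allocates on the device, then copies host -> *(&dev)
    // ... launch kernels that read dev ...
    checkGPU( gpuFree(dev) );              // release the device buffer when done
}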
@@ -168,10 +168,10 @@ public:
device_vector(T *src, size_t size)
: d_vptr(nullptr), vsize(0)
{
init(src, size);
copyin(src, size);
}
void init(T *src, size_t size)
void copyin(T *src, size_t size)
{
if (d_vptr != nullptr)
(void) gpuFree(d_vptr);
@@ -181,6 +181,14 @@ public:
vsize = size;
}
void copyout(T *dst)
{
if (d_vptr == nullptr)
return;
(void)checkGPU( gpuMemcpy(dst, d_vptr, vsize*sizeof(T), gpuMemcpyDeviceToHost) );
}
void resize(size_t size)
{
if (d_vptr != nullptr)
@@ -353,7 +361,7 @@ public:
float elapsed;
gpuEventSynchronize(stop);
gpuEventElapsedTime(&elapsed, start, stop);
return elapsed/1000.0;
return double(elapsed)/1000.0;
}
};
@@ -2,9 +2,12 @@
#include "entity_data.h"
template<size_t K>
struct kernel_gpu_sizes;
/* Compile-time constants configuring sizes for the kernels */
/* Compile-time constants configuring sizes for the kernels.
* Elements are organized in blocks of cells_per_dblock elements each.
* Each dblock contains dblock_size dofs, where dblock_size must be a
* multiple of the warp size. This means that some dofs in the dblock
* will be padding. The effective dofs used are dblock_bf.*/
template<size_t K>
struct kernel_gpu_sizes;
@@ -13,7 +16,7 @@ struct kernel_gpu_sizes<1>
{
static const size_t num_bf = 4;
static const size_t cells_per_dblock = 32;
static const size_t dblock_bf = num_bf * vols_per_ublock;
static const size_t dblock_bf = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 4;
};
@@ -23,7 +26,17 @@ struct kernel_gpu_sizes<2>
{
static const size_t num_bf = 10;
static const size_t cells_per_dblock = 12;
static const size_t dblock_bf = num_bf * vols_per_ublock;
static const size_t dblock_bf = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 1;
};
template<>
struct kernel_gpu_sizes<3>
{
static const size_t num_bf = 20;
static const size_t cells_per_dblock = 12;
static const size_t dblock_bf = num_bf * cells_per_dblock;
static const size_t dblock_size = 128;
static const size_t parallel_dblocks = 1;
};
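As an illustration of the padding described in the comment above (numbers taken from the specializations in this header; the static_asserts are only a sketch, not part of the commit):

static_assert(kernel_gpu_sizes<1>::dblock_bf == 4*32,
              "order 1: 128 used dofs fill the 128-dof dblock exactly, no padding");
static_assert(kernel_gpu_sizes<2>::dblock_size - kernel_gpu_sizes<2>::dblock_bf == 8,
              "order 2: 10 bf x 12 cells = 120 used dofs, 8 padding dofs per 128-dof dblock");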
@@ -38,8 +51,7 @@ struct kernel_gpu_sizes_runtime
};
template<size_t N>
kernel_gpu_sizes_runtime
get_kernel_gpu_sizes()
kernel_gpu_sizes_runtime get_kernel_gpu_sizes()
{
kernel_gpu_sizes_runtime ret;
@@ -47,13 +59,12 @@ get_kernel_gpu_sizes()
ret.cells_per_dblock = kernel_gpu_sizes<N>::cells_per_dblock;
ret.dblock_bf = kernel_gpu_sizes<N>::dblock_bf;
ret.dblock_size = kernel_gpu_sizes<N>::dblock_size;
ret.parallel_dblocks = kernel_gpu_sizes<N>::slice_size;
ret.parallel_dblocks = kernel_gpu_sizes<N>::parallel_dblocks;
return ret;
};
inline
kernel_gpu_sizes_runtime
inline kernel_gpu_sizes_runtime
get_kernel_gpu_sizes(int approx_order)
{
switch (approx_order)
@@ -65,49 +76,15 @@ get_kernel_gpu_sizes(int approx_order)
throw std::invalid_argument("Can't retrieve info for this element");
}
/* Return the number of dofs that this entity will take on GPU,
* including all the padding dofs */
size_t gpu_dofs(const entity_data_cpu& ed)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (num_dblocks % ks.cells_per_dblock)
num_dblocks += 1;
return num_dblocks * ks.dblock_size;
}
size_t gpu_dblocks(const entity_data_cpu&);
size_t gpu_dblocks_dofs(const entity_data_cpu&);
#define TO_GPU true
#define FROM_GPU false
void reshape_dofs(const entity_data_cpu&, const entity_data_gpu&, const vecxd&, vecxd&, bool);
void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
const vecxd& in_field, vecxd& out_field, bool to_gpu)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (num_dblocks % ks.cells_per_dblock)
num_dblocks += 1;
for (size_t i = 0; i < num_dblocks; i++)
{
for (size_t j = 0; j < ks.cells_per_dblock; j++)
{
auto cur_elem = ks.cells_per_dblock*i + j;
if (cur_elem >= tot_elems)
break;
auto cpu_base = ed.dof_base + cur_elem * ed.num_bf;
auto gpu_base = edg.dof_base + ks.dblock_size*i + ks.num_bf*j;
if (to_gpu)
for (size_t k = 0; k < ks.num_bf; j++)
out_field(gpu_base + k) = in_field(cpu_base + k);
else
for (size_t k = 0; k < ks.num_bf; j++)
out_field(cpu_base + k) = in_field(gpu_base + k);
}
}
}
void
gpu_compute_field_derivatives(entity_data_gpu& edg,
const double* F, double *dF_dx, double* dF_dy, double* dF_dz);
@@ -2,7 +2,7 @@
entity_data_gpu::entity_data_gpu()
{
num_elems = nullptr;
element_orientation = nullptr;
differentiation_matrices = nullptr;
jacobians = nullptr;
cell_determinants = nullptr;
@@ -13,8 +13,18 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
g_order = ed.g_order;
a_order = ed.a_order;
gpu_copyin(ed.num_elems, &num_elems);
num_orientations = ed.num_orientations;
num_all_elems = num_elems_all_orientations(ed);
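/* Build a per-element orientation lookup: entry iE of element_orientation
 * holds the orientation index of element iE in the entity-wide numbering. */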
std::vector<size_t> eo;
eo.reserve(num_all_elems);
for (size_t iO = 0; iO < ed.num_orientations; iO++)
{
for (size_t iT = 0; iT < ed.num_elems[iO]; iT++)
eo.push_back(iO);
}
assert(eo.size() == num_all_elems);
gpu_copyin(eo, &element_orientation);
num_bf = ed.num_bf;
dof_base = 0;
@@ -35,8 +45,9 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
entity_data_gpu::~entity_data_gpu()
{
if (num_elems)
checkGPU( gpuFree(num_elems) );
/*
if (element_orientation)
checkGPU( gpuFree(element_orientation) );
if (differentiation_matrices)
checkGPU( gpuFree(differentiation_matrices) );
@@ -46,5 +57,6 @@ entity_data_gpu::~entity_data_gpu()
if (cell_determinants)
checkGPU( gpuFree(cell_determinants) );
*/
}
#include "entity_data.h"
#include "kernels_gpu.h"
template<size_t K>
__global__ void
gpu_compute_field_derivatives_kernel_planar(entity_data_gpu edg,
const double* F, double *dF_dx, double* dF_dy, double* dF_dz)
{
using KS = kernel_gpu_sizes<K>;
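/* One thread per output dof: with dof_base set to 0 (as in the entity_data_gpu
 * constructor), thread ofs_in_entity handles basis function ofs_in_entity % num_bf
 * of element ofs_in_entity / num_bf of this entity. */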
int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
int32_t cur_dof_offset = edg.dof_base + ofs_in_entity;
if (cur_dof_offset >= edg.num_all_elems*edg.num_bf)
return;
int32_t iT = cur_dof_offset / KS::num_bf;
int32_t elem_dof_base = edg.dof_base + iT*edg.num_bf;
int32_t iO = edg.element_orientation[iT];
int32_t DM_row = ofs_in_entity % KS::num_bf;
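/* The three num_bf x num_bf differentiation matrices of each orientation are
 * stored back to back (presumably the reference-direction derivatives);
 * DM points at the first matrix of orientation iO. */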
int32_t DM_orient = 3*KS::num_bf*iO*KS::num_bf;
double *DM = edg.differentiation_matrices + DM_orient;
double accm_dF_dx = 0.0;
double accm_dF_dy = 0.0;
double accm_dF_dz = 0.0;
int32_t jac_ofs = 9*iT;
double J[9];
for (int32_t i = 0; i < 9; i++)
J[i] = edg.jacobians[jac_ofs+i];
for (int32_t dof = 0; dof < KS::num_bf; dof++)
{
int32_t d_ofs = DM_row + KS::num_bf*dof;
int32_t f_ofs = elem_dof_base + dof;
double v = DM[d_ofs] * F[f_ofs];
accm_dF_dx += J[0] * v;
accm_dF_dy += J[3] * v;
accm_dF_dz += J[6] * v;
}
for (int32_t dof = 0; dof < KS::num_bf; dof++)
{
int32_t d_ofs = DM_row + KS::num_bf*KS::num_bf + KS::num_bf*dof;
int32_t f_ofs = elem_dof_base + dof;
double v = DM[d_ofs] * F[f_ofs];
accm_dF_dx += J[1] * v;
accm_dF_dy += J[4] * v;
accm_dF_dz += J[7] * v;
}
for (int32_t dof = 0; dof < KS::num_bf; dof++)
{
int32_t d_ofs = DM_row + 2*KS::num_bf*KS::num_bf + KS::num_bf*dof;
int32_t f_ofs = elem_dof_base + dof;
double v = DM[d_ofs] * F[f_ofs];
accm_dF_dx += J[2] * v;
accm_dF_dy += J[5] * v;
accm_dF_dz += J[8] * v;
}
dF_dx[cur_dof_offset] = accm_dF_dx;
dF_dy[cur_dof_offset] = accm_dF_dy;
dF_dz[cur_dof_offset] = accm_dF_dz;
}
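The three accumulation loops above appear to apply the chain rule from reference to physical coordinates, reading edg.jacobians as nine doubles per element forming a 3x3 matrix J (row m = x,y,z, column k = reference direction):

\[ \left.\frac{\partial F}{\partial x_m}\right|_{T,i} \;=\; \sum_{k=0}^{2} J^{(T)}_{mk} \sum_{j=0}^{N_{bf}-1} (D^{(O)}_k)_{ij}\, F_{T,j}, \qquad m \in \{0,1,2\}, \]

where J^{(T)}_{mk} = edg.jacobians[9*iT + 3*m + k] and (D^{(O)}_k)_{ij} is entry i + num_bf*j of the k-th differentiation matrix of orientation O (the layout of jacobians is inferred from the indexing above).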
void
gpu_compute_field_derivatives(entity_data_gpu& edg,
const double* f, double *df_dx, double* df_dy, double* df_dz)
{
const auto THREADS_PER_BLOCK = 256;
auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
num_blocks += 1;
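/* Ceiling division over all dofs of the entity; e.g. a hypothetical order-2
 * entity with 1000 elements has 10*1000 = 10000 dofs, giving 40 blocks of 256 threads. */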
switch (edg.a_order)
{
case 1:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<1><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
break;
case 2:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<2><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<2>(ed, f, df_dx, df_dy, df_dz);
break;
case 3:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<3><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<3>(ed, f, df_dx, df_dy, df_dz);
break;
#if 0
case 4:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<4>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<4>(ed, f, df_dx, df_dy, df_dz);
break;
case 5:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<5>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<5>(ed, f, df_dx, df_dy, df_dz);
break;
case 6:
if (edg.g_order == 1)
gpu_compute_field_derivatives_kernel_planar<6>(edg, f, df_dx, df_dy, df_dz);
//else
// compute_field_derivatives_kernel_curved<6>(ed, f, df_dx, df_dy, df_dz);
break;
#endif
default:
std::cout << "gpu_compute_field_derivatives: invalid order" << std::endl;
}
}
\ No newline at end of file
#include "kernels_gpu.h"
/* Return how many dblocks are needed to store this entity on GPU. */
size_t gpu_dblocks(const entity_data_cpu& ed)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t tot_elems = num_elems_all_orientations(ed);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (tot_elems % ks.cells_per_dblock)
num_dblocks += 1;
return num_dblocks;
}
/* Return the total number of dofs (actual + padding) that are contained
* in the dblocks making up this entity. */
size_t gpu_dblocks_dofs(const entity_data_cpu& ed)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t num_dblocks = gpu_dblocks(ed);
return num_dblocks * ks.dblock_size;
}
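As a worked example (hypothetical sizes, using the kernel_gpu_sizes<2> constants): an order-2 entity with 1000 elements needs ceil(1000/12) = 84 dblocks, so gpu_dblocks_dofs returns 84 * 128 = 10752 dofs, of which 1000 * 10 = 10000 are actual dofs and 752 are padding.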
void reshape_dofs(const entity_data_cpu& ed, const entity_data_gpu& edg,
const vecxd& in_field, vecxd& out_field, bool to_gpu)
{
auto ks = get_kernel_gpu_sizes(ed.a_order);
size_t tot_elems = num_elems_all_orientations(ed);
size_t num_dblocks = tot_elems / ks.cells_per_dblock;
if (tot_elems % ks.cells_per_dblock)
num_dblocks += 1;
for (size_t i = 0; i < num_dblocks; i++)
{
for (size_t j = 0; j < ks.cells_per_dblock; j++)
{
auto cur_elem = ks.cells_per_dblock*i + j;
if (cur_elem >= tot_elems)
break;
auto cpu_base = ed.dof_base + cur_elem * ed.num_bf;
auto gpu_base = edg.dof_base + ks.dblock_size*i + ks.num_bf*j;
if (to_gpu)
for (size_t k = 0; k < ks.num_bf; k++)
out_field(gpu_base + k) = in_field(cpu_base + k);
else
for (size_t k = 0; k < ks.num_bf; k++)
out_field(cpu_base + k) = in_field(gpu_base + k);
}
}
}
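Continuing the same hypothetical order-2 numbers: element 25 is cell j = 1 of dblock i = 2 (since 25 = 2*12 + 1), so reshape_dofs moves its k-th dof between CPU index ed.dof_base + 25*10 + k and GPU index edg.dof_base + 2*128 + 1*10 + k, in the direction selected by to_gpu.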
@@ -10,6 +10,8 @@
#include "kernels_cpu.h"
#include "timecounter.h"
#include "kernels_gpu.h"
using namespace sgr;
static void
@@ -58,22 +60,34 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
auto& e0 = mod.entity_at(0);
e0.sort_by_orientation();
vecxd Pf_e0 = e0.project(f);
vecxd df_dx_e0 = vecxd::Zero( Pf_e0.rows() );
vecxd df_dy_e0 = vecxd::Zero( Pf_e0.rows() );
vecxd df_dz_e0 = vecxd::Zero( Pf_e0.rows() );
vecxd Pf_cpu = e0.project(f);
/* Make CPU entity data */
entity_data_cpu ed;
e0.populate_entity_data(ed);
/* Derive GPU entity data */
entity_data_gpu edg(ed);
#if 0
timecounter tc;
/* Prepare I/O vectors and call kernel */
device_vector<double> Pf_gpu(Pf_cpu.data(), Pf_cpu.size());
device_vector<double> df_dx_gpu(Pf_cpu.size());
device_vector<double> df_dy_gpu(Pf_cpu.size());
device_vector<double> df_dz_gpu(Pf_cpu.size());
timecounter_gpu tc;
tc.tic();
compute_field_derivatives(ed, Pf_e0, df_dx_e0, df_dy_e0, df_dz_e0);
gpu_compute_field_derivatives(edg, Pf_gpu.data(), df_dx_gpu.data(),
df_dy_gpu.data(), df_dz_gpu.data());
double time = tc.toc();
vecxd df_dx_cpu = vecxd::Zero( Pf_cpu.size() );
vecxd df_dy_cpu = vecxd::Zero( Pf_cpu.size() );
vecxd df_dz_cpu = vecxd::Zero( Pf_cpu.size() );
df_dx_gpu.copyout( df_dx_cpu.data() );
df_dy_gpu.copyout( df_dy_cpu.data() );
df_dz_gpu.copyout( df_dz_cpu.data() );
if (geometric_order == 1)
{
std::cout << "Kernel runtime: " << time << " seconds. Estimated performance: ";
@@ -101,9 +115,9 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
auto& re = e0.cell_refelem(pe);
matxd mass = e0.mass_matrix(iT);
auto num_bf = re.num_basis_functions();
vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_e0.segment(iT*num_bf, num_bf);
vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_e0.segment(iT*num_bf, num_bf);
vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_e0.segment(iT*num_bf, num_bf);
vecxd diff_x = Pdf_dx_e0.segment(iT*num_bf, num_bf) - df_dx_cpu.segment(iT*num_bf, num_bf);
vecxd diff_y = Pdf_dy_e0.segment(iT*num_bf, num_bf) - df_dy_cpu.segment(iT*num_bf, num_bf);
vecxd diff_z = Pdf_dz_e0.segment(iT*num_bf, num_bf) - df_dz_cpu.segment(iT*num_bf, num_bf);
err_x += diff_x.dot(mass*diff_x);
err_y += diff_y.dot(mass*diff_y);
@@ -115,14 +129,12 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
errs_x.push_back( std::sqrt(err_x) );
errs_y.push_back( std::sqrt(err_y) );
errs_z.push_back( std::sqrt(err_z) );
#endif
}
double rate_x = 0.0;
double rate_y = 0.0;
double rate_z = 0.0;
#if 0
std::cout << Byellowfg << "rate df/dx rate df/dy rate df/dz" << reset << std::endl;
for (size_t i = 1; i < sizes.size(); i++)
{
@@ -134,7 +146,6 @@ int test_differentiation_convergence(int geometric_order, int approximation_orde
COMPARE_VALUES_ABSOLUTE("df/dx", rate_x, double(approximation_order), 0.2);
COMPARE_VALUES_ABSOLUTE("df/dy", rate_y, double(approximation_order), 0.2);
COMPARE_VALUES_ABSOLUTE("df/dz", rate_z, double(approximation_order), 0.2);
#endif
return 0;
}
@@ -150,8 +161,8 @@ int main(void)
int failed_tests = 0;
std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
for (size_t go = 1; go < 5; go++)
for (size_t ao = go; ao < 5; ao++)
for (size_t go = 1; go < 2; go++)
for (size_t ao = go; ao < 4; ao++)
failed_tests += test_differentiation_convergence(go, ao);
return failed_tests;