Commit 4aff7fa6 authored by Matteo Cicuttin

Differentiation matrix in texture.

parent 6f4a26c8
@@ -69,18 +69,18 @@ num_elems_all_orientations(const entity_data_cpu& ed)
 #ifdef ENABLE_GPU_SOLVER
 struct entity_data_gpu
 {
-    int g_order;
-    int a_order;
-    size_t num_all_elems;
-    size_t * element_orientation;
-    size_t num_bf;
+    int g_order;
+    int a_order;
+    size_t num_all_elems;
+    device_vector<int32_t> element_orientation;
+    size_t num_bf;
-    size_t dof_base;
+    size_t dof_base;
-    double* differentiation_matrices;
-    double* jacobians;
-    double* cell_determinants;
+    texture_allocator<double> differentiation_matrices;
+    device_vector<double> jacobians;
+    device_vector<double> cell_determinants;
     entity_data_gpu();
......
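The raw device pointers in entity_data_gpu are replaced by owning wrappers, so the manual gpuFree calls in the destructor (see below) become unnecessary. The device_vector and texture_allocator implementations are not part of this diff; a minimal sketch of a device_vector with the copyin()/data()/size() interface used here, written against the plain CUDA runtime API rather than the project's gpu* abstraction layer, could look like this:

#include <cuda_runtime.h>
#include <cstddef>

/* Illustrative sketch only: the real device_vector is not shown in this diff. */
template<typename T>
class device_vector
{
    T*      m_ptr  = nullptr;
    size_t  m_size = 0;

public:
    device_vector() = default;
    ~device_vector() { if (m_ptr) cudaFree(m_ptr); }

    /* Allocate device storage and copy 'size' elements from host memory. */
    void copyin(const T* host, size_t size)
    {
        if (m_ptr) cudaFree(m_ptr);
        cudaMalloc((void**)&m_ptr, size*sizeof(T));
        cudaMemcpy(m_ptr, host, size*sizeof(T), cudaMemcpyHostToDevice);
        m_size = size;
    }

    T*     data() const { return m_ptr; }
    size_t size() const { return m_size; }
};

With such a wrapper, element_orientation.copyin(eo.data(), eo.size()) in the constructor below replaces the previous gpu_copyin call and the matching manual free.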
@@ -41,6 +41,16 @@ struct kernel_gpu_sizes<3>
     static const size_t parallel_dblocks = 1;
 };
+template<>
+struct kernel_gpu_sizes<4>
+{
+    static const size_t num_bf = 35;
+    static const size_t cells_per_dblock = 12;
+    static const size_t dblock_bf = num_bf * cells_per_dblock;
+    static const size_t dblock_size = 128;
+    static const size_t parallel_dblocks = 1;
+};
 struct kernel_gpu_sizes_runtime
 {
     size_t num_bf;
......
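In the new kernel_gpu_sizes<4> specialization, num_bf = 35 is the dimension of the degree-4 polynomial space on a tetrahedron, (p+1)(p+2)(p+3)/6. A standalone compile-time check (illustrative, not part of the commit):

#include <cstddef>

/* Dimension of P_p on a tetrahedron: (p+1)(p+2)(p+3)/6. */
constexpr size_t tet_basis_size(size_t p)
{
    return (p+1)*(p+2)*(p+3)/6;
}

static_assert(tet_basis_size(4) == 35, "degree-4 tetrahedral basis has 35 functions");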
@@ -2,10 +2,6 @@
 entity_data_gpu::entity_data_gpu()
 {
-    element_orientation = nullptr;
-    differentiation_matrices = nullptr;
-    jacobians = nullptr;
-    cell_determinants = nullptr;
 }
 entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
@@ -15,15 +11,15 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
     num_all_elems = num_elems_all_orientations(ed);
-    std::vector<size_t> eo;
+    std::vector<int32_t> eo;
     eo.reserve(num_all_elems);
-    for (size_t iO = 0; iO < ed.num_orientations; iO++)
+    for (int32_t iO = 0; iO < ed.num_orientations; iO++)
     {
         for (size_t iT = 0; iT < ed.num_elems[iO]; iT++)
             eo.push_back(iO);
     }
     assert(eo.size() == num_all_elems);
-    gpu_copyin(eo, &element_orientation);
+    element_orientation.copyin(eo.data(), eo.size());
     num_bf = ed.num_bf;
@@ -44,28 +40,15 @@ entity_data_gpu::entity_data_gpu(const entity_data_cpu& ed)
             ed.differentiation_matrices.block(i*num_bf, 2*num_bf, num_bf, num_bf).transpose();
     }
-    gpu_copyin(dm, &differentiation_matrices);
+    differentiation_matrices.init(dm.data(), dm.size());
-    gpu_copyin(ed.jacobians, &jacobians);
+    jacobians.copyin(ed.jacobians.data(), ed.jacobians.size());
-    gpu_copyin(ed.cell_determinants, &cell_determinants);
+    cell_determinants.copyin(ed.cell_determinants.data(), ed.cell_determinants.size());
 }
 entity_data_gpu::~entity_data_gpu()
 {
-    /*
-    if (element_orientation)
-        checkGPU( gpuFree(element_orientation) );
-    if (differentiation_matrices)
-        checkGPU( gpuFree(differentiation_matrices) );
-    if (jacobians)
-        checkGPU( gpuFree(jacobians) );
-    if (cell_determinants)
-        checkGPU( gpuFree(cell_determinants) );
-    */
 }
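differentiation_matrices.init() now uploads the per-orientation differentiation matrices once and exposes them through a texture object, which the kernels obtain via get_texture(). The texture_allocator itself is not shown in the diff; a plausible sketch under the standard CUDA texture-object API, assuming the double buffer is described to the texture unit as int2 pairs (texture fetches cannot return double directly), is:

#include <cuda_runtime.h>
#include <cstddef>

/* Illustrative sketch only: the real texture_allocator is not shown in this diff. */
template<typename T> class texture_allocator;

template<>
class texture_allocator<double>
{
    double*             m_ptr = nullptr;
    cudaTextureObject_t m_tex = 0;

public:
    void init(const double* host, size_t size)
    {
        cudaMalloc((void**)&m_ptr, size*sizeof(double));
        cudaMemcpy(m_ptr, host, size*sizeof(double), cudaMemcpyHostToDevice);

        /* Describe the linear buffer as int2 elements: each double is fetched
         * as two 32-bit halves and reassembled in the kernel. */
        cudaResourceDesc rd = {};
        rd.resType                = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = m_ptr;
        rd.res.linear.desc        = cudaCreateChannelDesc<int2>();
        rd.res.linear.sizeInBytes = size*sizeof(double);

        cudaTextureDesc td = {};
        td.readMode = cudaReadModeElementType;

        cudaCreateTextureObject(&m_tex, &rd, &td, nullptr);
    }

    cudaTextureObject_t get_texture() const { return m_tex; }

    ~texture_allocator()
    {
        if (m_tex) cudaDestroyTextureObject(m_tex);
        if (m_ptr) cudaFree(m_ptr);
    }
};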
@@ -3,117 +3,48 @@
 template<size_t K>
 __global__ void
-gpu_compute_field_derivatives_kernel_planar_CM(entity_data_gpu edg,
-    const double* F, double *dF_dx, double* dF_dy, double* dF_dz)
+gpu_deriv_planar(const double *F, const double *J, gpuTextureObject_t DM_tex,
+    double *dF_dx, double* dF_dy, double* dF_dz, int32_t num_all_elems,
+    int32_t* orients)
 {
     using KS = kernel_gpu_sizes<K>;
     int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
-    int32_t cur_dof_offset = edg.dof_base + ofs_in_entity;
-    if (cur_dof_offset >= edg.num_all_elems*edg.num_bf)
+    int32_t cur_dof_offset = /*edg.dof_base + */ofs_in_entity;
+    if (cur_dof_offset >= num_all_elems*KS::num_bf)
         return;
     int32_t iT = cur_dof_offset / KS::num_bf;
-    int32_t elem_dof_base = edg.dof_base + iT*edg.num_bf;
-    int32_t iO = edg.element_orientation[iT];
-    int32_t DM_row = ofs_in_entity % KS::num_bf;
-    int32_t DM_orient = 3*KS::num_bf*iO*KS::num_bf;
-    double *DM = edg.differentiation_matrices + DM_orient;
-    double accm_dF_dx = 0.0;
-    double accm_dF_dy = 0.0;
-    double accm_dF_dz = 0.0;
-    int32_t jac_ofs = 9*iT;
-    double J[9];
-    for (int32_t i = 0; i < 9; i++)
-        J[i] = edg.jacobians[jac_ofs+i];
-    for (int32_t dof = 0; dof < KS::num_bf; dof++)
-    {
-        int32_t d_ofs = DM_row + KS::num_bf*dof;
-        int32_t f_ofs = elem_dof_base + dof;
-        double v = DM[d_ofs] * F[f_ofs];
-        accm_dF_dx += J[0] * v;
-        accm_dF_dy += J[3] * v;
-        accm_dF_dz += J[6] * v;
-    }
-    for (int32_t dof = 0; dof < KS::num_bf; dof++)
-    {
-        int32_t d_ofs = DM_row + KS::num_bf*KS::num_bf + KS::num_bf*dof;
-        int32_t f_ofs = elem_dof_base + dof;
-        double v = DM[d_ofs] * F[f_ofs];
-        accm_dF_dx += J[1] * v;
-        accm_dF_dy += J[4] * v;
-        accm_dF_dz += J[7] * v;
-    }
-    for (int32_t dof = 0; dof < KS::num_bf; dof++)
-    {
-        int32_t d_ofs = DM_row + 2*KS::num_bf*KS::num_bf + KS::num_bf*dof;
-        int32_t f_ofs = elem_dof_base + dof;
-        double v = DM[d_ofs] * F[f_ofs];
-        accm_dF_dx += J[2] * v;
-        accm_dF_dy += J[5] * v;
-        accm_dF_dz += J[8] * v;
-    }
-    dF_dx[cur_dof_offset] = accm_dF_dx;
-    dF_dy[cur_dof_offset] = accm_dF_dy;
-    dF_dz[cur_dof_offset] = accm_dF_dz;
-}
-template<size_t K>
-__global__ void
-gpu_compute_field_derivatives_kernel_planar(entity_data_gpu edg,
-    const double* F, double *dF_dx, double* dF_dy, double* dF_dz)
-{
-    using KS = kernel_gpu_sizes<K>;
-    int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
-    int32_t cur_dof_offset = edg.dof_base + ofs_in_entity;
-    if (cur_dof_offset >= edg.num_all_elems*edg.num_bf)
-        return;
-    int32_t iT = cur_dof_offset / KS::num_bf;
-    int32_t elem_dof_base = edg.dof_base + iT*edg.num_bf;
-    int32_t iO = edg.element_orientation[iT];
+    int32_t elem_dof_base = /*edg.dof_base + */iT*KS::num_bf;
+    int32_t iO = orients[iT];
     int32_t DM_row = ofs_in_entity % KS::num_bf;
     int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
-    double *DM = edg.differentiation_matrices + DM_orient;
     double accm_dF_dx = 0.0;
     double accm_dF_dy = 0.0;
     double accm_dF_dz = 0.0;
     int32_t jac_ofs = 9*iT;
-    double J[9];
-    for (int32_t i = 0; i < 9; i++)
-        J[i] = edg.jacobians[jac_ofs+i];
     for (int32_t dof = 0; dof < KS::num_bf; dof++)
     {
-        int32_t d_ofs = DM_row + 3*KS::num_bf*dof;
+        int32_t d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof;
         int32_t f_ofs = elem_dof_base + dof;
-        double v = DM[d_ofs] * F[f_ofs];
-        accm_dF_dx += J[0] * v;
-        accm_dF_dy += J[3] * v;
-        accm_dF_dz += J[6] * v;
+        double v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
+        accm_dF_dx += J[jac_ofs+0] * v;
+        accm_dF_dy += J[jac_ofs+3] * v;
+        accm_dF_dz += J[jac_ofs+6] * v;
-        d_ofs = DM_row + 3*KS::num_bf*dof + KS::num_bf;
-        v = DM[d_ofs] * F[f_ofs];
-        accm_dF_dx += J[1] * v;
-        accm_dF_dy += J[4] * v;
-        accm_dF_dz += J[7] * v;
+        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
+        accm_dF_dx += J[jac_ofs+1] * v;
+        accm_dF_dy += J[jac_ofs+4] * v;
+        accm_dF_dz += J[jac_ofs+7] * v;
-        d_ofs = DM_row + 3*KS::num_bf*dof + 2*KS::num_bf;
-        v = DM[d_ofs] * F[f_ofs];
-        accm_dF_dx += J[2] * v;
-        accm_dF_dy += J[5] * v;
-        accm_dF_dz += J[8] * v;
+        d_ofs = DM_orient + DM_row + 3*KS::num_bf*dof + 2*KS::num_bf;
+        v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
+        accm_dF_dx += J[jac_ofs+2] * v;
+        accm_dF_dy += J[jac_ofs+5] * v;
+        accm_dF_dz += J[jac_ofs+8] * v;
     }
     dF_dx[cur_dof_offset] = accm_dF_dx;
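fetch_tex is likewise not defined in this diff; gpuTextureObject_t presumably aliases cudaTextureObject_t on CUDA builds. Under the int2-channel assumption sketched above, the conventional way to read a double through a texture object is:

#include <cuda_runtime.h>
#include <cstdint>

/* Sketch under the int2-channel assumption above, not taken from the diff:
 * fetch the two 32-bit halves and reassemble the double. */
static inline __device__ double
fetch_tex(cudaTextureObject_t tex, int32_t i)
{
    int2 v = tex1Dfetch<int2>(tex, i);
    return __hiloint2double(v.y, v.x);
}

Routing the differentiation matrices through the texture path lets those reads go through the texture cache, while the per-element Jacobian is now read directly from global memory instead of being staged into a local J[9] array as before.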
@@ -130,36 +61,46 @@ gpu_compute_field_derivatives(entity_data_gpu& edg,
     if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
         num_blocks += 1;
+    auto J = edg.jacobians.data();
+    auto Dtex = edg.differentiation_matrices.get_texture();
+    int32_t num_elems = edg.num_all_elems;
+    auto orients = edg.element_orientation.data();
+    auto num_orients = edg.element_orientation.size();
     switch (edg.a_order)
     {
     case 1:
         if (edg.g_order == 1)
-            gpu_compute_field_derivatives_kernel_planar<1><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
+            gpu_deriv_planar<1><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+                Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<1>(ed, f, df_dx, df_dy, df_dz);
         break;
     case 2:
         if (edg.g_order == 1)
-            gpu_compute_field_derivatives_kernel_planar<2><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
+            gpu_deriv_planar<2><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+                Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
        //    compute_field_derivatives_kernel_curved<2>(ed, f, df_dx, df_dy, df_dz);
         break;
     case 3:
         if (edg.g_order == 1)
-            gpu_compute_field_derivatives_kernel_planar<3><<<num_blocks, THREADS_PER_BLOCK>>>(edg, f, df_dx, df_dy, df_dz);
+            gpu_deriv_planar<3><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+                Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<3>(ed, f, df_dx, df_dy, df_dz);
         break;
-#if 0
     case 4:
         if (edg.g_order == 1)
-            gpu_compute_field_derivatives_kernel_planar<4>(edg, f, df_dx, df_dy, df_dz);
+            gpu_deriv_planar<4><<<num_blocks, THREADS_PER_BLOCK>>>(f, J,
+                Dtex, df_dx, df_dy, df_dz, num_elems, orients);
         //else
         //    compute_field_derivatives_kernel_curved<4>(ed, f, df_dx, df_dy, df_dz);
         break;
+#if 0
     case 5:
         if (edg.g_order == 1)
             gpu_compute_field_derivatives_kernel_planar<5>(edg, f, df_dx, df_dy, df_dz);
......
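The dispatch launches one thread per degree of freedom, so the grid must cover num_bf * num_all_elems threads rounded up to whole blocks, which is what the num_blocks adjustment above does. A worked example with purely illustrative numbers (35 basis functions, 1000 elements, THREADS_PER_BLOCK taken as 128):

/* Illustrative grid sizing, one thread per DoF. */
size_t dofs       = 35 * 1000;          /* num_bf * num_all_elems            */
size_t num_blocks = dofs / 128;         /* 35000 / 128 = 273 full blocks     */
if (dofs % 128)
    num_blocks += 1;                    /* remainder 56, so 274 blocks total */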
@@ -162,7 +162,7 @@ int main(void)
     std::cout << Bmagentafg << " *** TESTING: DIFFERENTIATION ***" << reset << std::endl;
     for (size_t go = 1; go < 2; go++)
-        for (size_t ao = go; ao < 4; ao++)
+        for (size_t ao = go; ao < 5; ao++)
             failed_tests += test_differentiation_convergence(go, ao);
     return failed_tests;
......