    /* kernels_cuda.cu */
    #include "entity_data.h"
    #include "kernels_gpu.h"
    
    /* compute α∇F */
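    /* One thread per output dof. For each of the precomputed element
     * orientations, DM_tex packs 3*num_bf*num_bf entries: for every basis
     * function `dof`, three blocks of num_bf values, one per reference
     * direction. J stores 9 doubles per element; its three rows combine the
     * reference-direction derivatives into the physical x, y and z
     * derivatives, and the result is scaled by alpha. */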
    template<size_t K>
    __global__ void
    gpu_deriv_planar(const double *F, const double * __restrict__ J,
        gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
        double * __restrict__ dF_dy, double * __restrict__ dF_dz,
        double alpha, int32_t num_all_elems, const int32_t* orients,
        int32_t dof_base)
    {
        using KS = kernel_gpu_sizes<K>;
    
        const int32_t ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
        if (ofs_in_entity >= num_all_elems*KS::num_bf)
            return;
    
        const int32_t output_dof_offset = dof_base + ofs_in_entity;
    
        const int32_t iT = ofs_in_entity / KS::num_bf;
        const int32_t elem_dof_base = dof_base + iT*KS::num_bf;
        const int32_t iO = orients[iT];
        const int32_t DM_row = ofs_in_entity % KS::num_bf;
        const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
        const int32_t DM_base = DM_orient + DM_row;
        const int32_t jac_ofs = 9*iT;
    
        double accm_dF_dx = 0.0;
        double accm_dF_dy = 0.0;
        double accm_dF_dz = 0.0;
        for (int32_t dof = 0; dof < KS::num_bf; dof++)
        {
            int32_t d_ofs = DM_base + 3*KS::num_bf*dof;
            int32_t f_ofs = elem_dof_base + dof;
            double v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
            accm_dF_dx += J[jac_ofs+0] * v;
            accm_dF_dy += J[jac_ofs+3] * v;
            accm_dF_dz += J[jac_ofs+6] * v;
        
            d_ofs = DM_base + 3*KS::num_bf*dof + KS::num_bf;
            v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
            accm_dF_dx += J[jac_ofs+1] * v;
            accm_dF_dy += J[jac_ofs+4] * v;
            accm_dF_dz += J[jac_ofs+7] * v;
        
            d_ofs = DM_base + 3*KS::num_bf*dof + 2*KS::num_bf;
            v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
            accm_dF_dx += J[jac_ofs+2] * v;
            accm_dF_dy += J[jac_ofs+5] * v;
            accm_dF_dz += J[jac_ofs+8] * v;
        }
    
        dF_dx[output_dof_offset] = alpha*accm_dF_dx;
        dF_dy[output_dof_offset] = alpha*accm_dF_dy;
        dF_dz[output_dof_offset] = alpha*accm_dF_dz;
    }
    
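    /* Blocked variant of the kernel above: blockDim.x covers one dof block of
     * KS::dblock_size threads spanning KS::cells_per_dblock elements (threads
     * past KS::dofs_per_dblock are padding and exit early), while blockDim.y
     * processes KS::parallel_dblocks such blocks per thread block. */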
    template<size_t K>
    __global__ void
    gpu_deriv_planar_blocked(const double *F, const double * __restrict__ J,
        gpuTextureObject_t DM_tex, double * __restrict__ dF_dx,
        double * __restrict__ dF_dy, double * __restrict__ dF_dz,
        double alpha, int32_t num_all_elems, const int32_t* orients,
        int32_t dof_base)
    {
        using KS = kernel_gpu_sizes<K>;
    
        if (threadIdx.x >= KS::dofs_per_dblock)
            return;
        const int32_t y_idx = (blockIdx.y * blockDim.y + threadIdx.y);
        const int32_t elem_base = y_idx * KS::cells_per_dblock;
        const int32_t elem_ofs = threadIdx.x / KS::num_bf;
        const int32_t iT = elem_base + elem_ofs;
    
        if (iT >= num_all_elems)
            return;
        
        const int32_t block_dof_base = y_idx * KS::dblock_size;
    
        const int32_t output_dof_offset = dof_base + block_dof_base + threadIdx.x;
    
        const int32_t elem_dof_base = dof_base + block_dof_base + elem_ofs*KS::num_bf;
        const int32_t iO = orients[iT];
        const int32_t DM_row = threadIdx.x % KS::num_bf;
        const int32_t DM_orient = 3*KS::num_bf*KS::num_bf*iO;
        const int32_t DM_base = DM_orient + DM_row;
        const int32_t jac_ofs = 9*iT;
    
        double accm_dF_dx = 0.0;
        double accm_dF_dy = 0.0;
        double accm_dF_dz = 0.0;
        for (int32_t dof = 0; dof < KS::num_bf; dof++)
        {
            int32_t d_ofs = DM_base + 3*KS::num_bf*dof;
            int32_t f_ofs = elem_dof_base + dof;
            double v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
            accm_dF_dx += J[jac_ofs+0] * v;
            accm_dF_dy += J[jac_ofs+3] * v;
            accm_dF_dz += J[jac_ofs+6] * v;
        
            d_ofs = DM_base + 3*KS::num_bf*dof + KS::num_bf;
            v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
            accm_dF_dx += J[jac_ofs+1] * v;
            accm_dF_dy += J[jac_ofs+4] * v;
            accm_dF_dz += J[jac_ofs+7] * v;
        
            d_ofs = DM_base + 3*KS::num_bf*dof + 2*KS::num_bf;
            v = fetch_tex(DM_tex, d_ofs) * F[f_ofs];
            accm_dF_dx += J[jac_ofs+2] * v;
            accm_dF_dy += J[jac_ofs+5] * v;
            accm_dF_dz += J[jac_ofs+8] * v;
        }
    
        dF_dx[output_dof_offset] = alpha*accm_dF_dx;
        dF_dy[output_dof_offset] = alpha*accm_dF_dy;
        dF_dz[output_dof_offset] = alpha*accm_dF_dz;
    }
    
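    /* Host-side launcher for one entity: selects the blocked or the plain
     * kernel and sizes the grid by ceiling division of the total work by the
     * threads per block. Only planar geometries (g_order == 1) are handled. */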
    template<size_t K>
    void
    launch_deriv_kernel(const entity_data_gpu& edg,
        const double *f, double *df_dx, double* df_dy, double* df_dz, double alpha)
    {
        auto J = edg.jacobians.data();
        auto Dtex = edg.differentiation_matrices.get_texture();
        int32_t num_elems = edg.num_all_elems;
        auto orients = edg.element_orientation.data();
        auto num_orients = edg.element_orientation.size();
    
        using KS = kernel_gpu_sizes<K>;
    
    #ifdef USE_BLOCKED_GPU_KERNELS
        size_t num_blocks = edg.num_all_elems / (KS::cells_per_dblock * KS::parallel_dblocks);
        if (edg.num_all_elems % (KS::cells_per_dblock * KS::parallel_dblocks))
            num_blocks += 1;
    
        dim3 grid_size(1, num_blocks);
        dim3 block_size(KS::dblock_size, KS::parallel_dblocks);
        if (edg.g_order == 1)
            gpu_deriv_planar_blocked<K><<<grid_size, block_size>>>(f, J,
                Dtex, df_dx, df_dy, df_dz, alpha, num_elems, orients, edg.dof_base);
        /* TODO: curved-element (g_order > 1) derivatives are not implemented yet. */
    #else
        auto num_blocks = edg.num_bf*edg.num_all_elems/KS::deriv_threads;
        if (edg.num_bf*edg.num_all_elems % KS::deriv_threads)
            num_blocks += 1;
    
        if (edg.g_order == 1)
            gpu_deriv_planar<K><<<num_blocks, KS::deriv_threads>>>(f, J,
                Dtex, df_dx, df_dy, df_dz, alpha, num_elems, orients, edg.dof_base);
        /* TODO: curved-element (g_order > 1) derivatives are not implemented yet. */
    #endif
    }
    
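    /* Dispatch the runtime approximation order edg.a_order (1..6) to the
     * corresponding compile-time kernel instantiation. */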
    void
    gpu_compute_field_derivatives(const entity_data_gpu& edg,
        const double *f, double *df_dx, double* df_dy, double* df_dz, double alpha)
    {
        switch (edg.a_order)
        {
            case 1:
                launch_deriv_kernel<1>(edg, f, df_dx, df_dy, df_dz, alpha);
                break;
    
            case 2:
                launch_deriv_kernel<2>(edg, f, df_dx, df_dy, df_dz, alpha);
                break;
        
            case 3:
                launch_deriv_kernel<3>(edg, f, df_dx, df_dy, df_dz, alpha);
                break;
    
            case 4:
                launch_deriv_kernel<4>(edg, f, df_dx, df_dy, df_dz, alpha);
                break;
          
            case 5:
                launch_deriv_kernel<5>(edg, f, df_dx, df_dy, df_dz, alpha);
                break;
    
            case 6:
                launch_deriv_kernel<6>(edg, f, df_dx, df_dy, df_dz, alpha);
                break;
    
            default:
                throw std::invalid_argument("compute_field_derivatives: invalid order");
        }
    }
    
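    /* Apply the lifting operator to the element face fluxes: each output dof
     * accumulates one row of the num_bf x (4*num_fluxes) lifting matrix of the
     * element's orientation against its 4*num_fluxes flux values, scaled by
     * the inverse Jacobian determinant, and adds the result to lifted_flux. */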
    template<size_t K>
    __global__ void
    gpu_lift_planar(const double *flux, gpuTextureObject_t LM_tex,
        const double * __restrict__ dets, double * __restrict__ lifted_flux,
        int32_t num_all_elems, int32_t* orients, int32_t dof_base, int32_t flux_base)
    {
        /* This kernel saturates the texture cache bandwidth (~1 TB/s on a K20X)
         * and therefore does not perform well: the lifting matrix entries are
         * not reused enough. Sufficient reuse would require multiplying several
         * right-hand sides with each fetched matrix entry instead of only one
         * at a time, but handling the orientations makes that complicated, so
         * for now the kernel remains slow. */
        using KS = kernel_gpu_sizes<K>;
    
        /* One thread per *output* dof */
        int ofs_in_entity = (blockIdx.x * blockDim.x + threadIdx.x);
        if (ofs_in_entity >= num_all_elems*KS::num_bf)
            return;
    
        int32_t cur_dof_offset = dof_base + ofs_in_entity;
    
        int32_t iT = ofs_in_entity / KS::num_bf;
        int32_t elem_flux_base = flux_base + 4*iT*KS::num_fluxes;
        int32_t iO = orients[iT];
        int32_t LM_row = ofs_in_entity % KS::num_bf;
        int32_t LM_orient = 4*KS::num_bf*KS::num_fluxes*iO;
        double inv_det = 1./dets[iT];
    
        int32_t delta = ofs_in_entity % KS::num_bf;
    
        double acc = 0.0;
        for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
        {
            int32_t l_ofs = LM_orient + LM_row + KS::num_bf*dof;
            int32_t f_ofs = elem_flux_base + dof;//(dof+delta)%(4*KS::num_fluxes);
            acc += inv_det * fetch_tex(LM_tex, l_ofs) * flux[f_ofs];//fetch_tex(flux, f_ofs);
        }
    
        lifted_flux[cur_dof_offset] += acc;
    }
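
    /* A possible way to get the reuse described in the comment above (only a
     * sketch, not code used in this file): fetch each lifting-matrix entry
     * once and apply it to several right-hand sides at a time, e.g. the three
     * components of a vector field. The three-field buffers fluxA/B/C and
     * liftedA/B/C are hypothetical.
     *
     *   double accA = 0.0, accB = 0.0, accC = 0.0;
     *   for (int32_t dof = 0; dof < 4*KS::num_fluxes; dof++)
     *   {
     *       double lm = fetch_tex(LM_tex, LM_orient + LM_row + KS::num_bf*dof);
     *       int32_t f_ofs = elem_flux_base + dof;
     *       accA += lm * fluxA[f_ofs];   // one texture fetch feeds three
     *       accB += lm * fluxB[f_ofs];   // accumulations instead of one
     *       accC += lm * fluxC[f_ofs];
     *   }
     *   liftedA[cur_dof_offset] += inv_det * accA;
     *   liftedB[cur_dof_offset] += inv_det * accB;
     *   liftedC[cur_dof_offset] += inv_det * accC;
     */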
    
    template<size_t K>
    void
    launch_lift_kernel(entity_data_gpu& edg, const double *f, double *out)
    {
        const auto THREADS_PER_BLOCK = 128; // kernel_gpu_sizes<K>::deriv_threads
        auto num_blocks = edg.num_bf*edg.num_all_elems/THREADS_PER_BLOCK;
        if (edg.num_bf*edg.num_all_elems % THREADS_PER_BLOCK)
            num_blocks += 1;
    
        auto Ltex = edg.lifting_matrices.get_texture();
        int32_t num_elems = edg.num_all_elems;
        auto orients = edg.element_orientation.data();
        auto num_orients = edg.element_orientation.size();
        auto dets = edg.cell_determinants.data();
    
        if (edg.g_order == 1)
            gpu_lift_planar<K><<<num_blocks, THREADS_PER_BLOCK>>>(f, Ltex,
                dets, out, num_elems, orients, edg.dof_base, edg.flux_base);
        /* TODO: curved-element (g_order > 1) lifting is not implemented yet. */
    }
    
    void
    gpu_compute_flux_lifting(entity_data_gpu& edg, const double *f, double *out)
    {
        switch (edg.a_order)
        {
            case 1:
                launch_lift_kernel<1>(edg, f, out);
                break;
    
            case 2:
                launch_lift_kernel<2>(edg, f, out);
                break;
        
            case 3:
                launch_lift_kernel<3>(edg, f, out);
                break;
    
            case 4:
                launch_lift_kernel<4>(edg, f, out);
                break;
          
            case 5:
                launch_lift_kernel<5>(edg, f, out);
                break;
    
            case 6:
                launch_lift_kernel<6>(edg, f, out);
                break;
    
            default:
                throw std::invalid_argument("compute_field_derivatives: invalid order");
        }
    }
    
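    /* Element-wise dst[i] = add[i] - sub[i] for i in [0, num_dofs). */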
    void __global__
    gpu_subtract_kernel(double * __restrict__ dst, const double * __restrict__ add,
        const double * __restrict__ sub, size_t num_dofs)
    {
        int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
        if (dof < num_dofs)
            dst[dof] = add[dof] - sub[dof];
    }
    
    void
    gpu_subtract(double *dst, const double *add, const double *sub, size_t num_dofs)
    {
        static const size_t SUBTRACT_THREADS = 256;
        auto gs = num_dofs/SUBTRACT_THREADS;
        if (num_dofs % SUBTRACT_THREADS != 0)
            gs += 1;
        
        gpu_subtract_kernel<<<gs, SUBTRACT_THREADS>>>(dst, add, sub, num_dofs);
    }
    
    
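    /* Forward Euler updates, one thread per dof: out = y + dt*k, optionally
     * scaled by a per-dof material coefficient; a further overload below also
     * applies a per-dof reaction term, out = y*(1 - dt*reaction) + dt*k*material. */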
    void __global__
    gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
        const double * __restrict__ k, size_t num_dofs, double dt)
    {
        int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
        if (dof < num_dofs)
            out[dof] = y[dof] + dt*k[dof];
    }
    
    void __global__
    gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
        const double * __restrict__ k, const double * __restrict__ material,
        size_t num_dofs, double dt)
    {
        int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
        if (dof < num_dofs)
            out[dof] = y[dof] + dt*k[dof]*material[dof];
    }
    
    void
    gpu_compute_euler_update(double *out, const double *y, const double *k,
        const double *material, size_t num_dofs, double dt)
    {
        static const size_t EULER_THREADS = 256;
    
        auto gs = num_dofs/EULER_THREADS;
        if (num_dofs % EULER_THREADS != 0)
            gs += 1;
    
        gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, material, num_dofs, dt);
    }
    
    void __global__
    gpu_euler_update_kernel(double * __restrict__ out, const double * __restrict__ y,
        const double * __restrict__ k, const double * __restrict__ reaction,
        const double * __restrict__ material, size_t num_dofs, double dt)
    {
        int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
        if (dof < num_dofs)
            out[dof] = y[dof]*(1-dt*reaction[dof]) + dt*k[dof]*material[dof];
    }
    
    void
    gpu_compute_euler_update(double *out, const double *y, const double *k,
        const double *reaction, const double *material, size_t num_dofs, double dt)
    {
        static const size_t EULER_THREADS = 256;
    
        auto gs = num_dofs/EULER_THREADS;
        if (num_dofs % EULER_THREADS != 0)
            gs += 1;
    
        gpu_euler_update_kernel<<<gs, EULER_THREADS>>>(out, y, k, reaction, material, num_dofs, dt);
    }
    
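    /* Classical RK4 combination, one thread per dof:
     * out = in + dt*(k1 + 2*k2 + 2*k3 + k4)/6, optionally scaled by a per-dof
     * material coefficient and damped by a per-dof reaction term. */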
    void __global__
    gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
        const double * __restrict__ k1, const double * __restrict__ k2,
        const double * __restrict__ k3, const double * __restrict__ k4,
        size_t num_dofs, double dt)
    {
        int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
        if (dof < num_dofs)
            out[dof] = in[dof] + dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
    }
    
    void __global__
    gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
        const double * __restrict__ k1, const double * __restrict__ k2,
        const double * __restrict__ k3, const double * __restrict__ k4,
        const double * __restrict__ material,
        size_t num_dofs, double dt)
    {
        int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
        if (dof < num_dofs)
            out[dof] = in[dof] + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
    }
    
    void
    gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
        const double *k2, const double *k3, const double *k4,
        const double *material, size_t num_dofs, double dt)
    {
        static const size_t RK4_THREADS = 256;
    
        auto gs = num_dofs/RK4_THREADS;
        if (num_dofs % RK4_THREADS != 0)
            gs += 1;
    
        gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4,
            material, num_dofs, dt);
    }
    
    void __global__
    gpu_rk4_weighted_sum_kernel(double * __restrict__ out, const double * __restrict__ in,
        const double * __restrict__ k1, const double * __restrict__ k2,
        const double * __restrict__ k3, const double * __restrict__ k4,
        const double * __restrict__ reaction, const double * __restrict__ material,
        size_t num_dofs, double dt)
    {
        int32_t dof = blockIdx.x * blockDim.x + threadIdx.x;
        if (dof < num_dofs)
            out[dof] = in[dof]*(1-dt*reaction[dof]) + material[dof]*dt*(k1[dof] + 2*k2[dof] + 2*k3[dof] + k4[dof])/6;
    }
    
    void
    gpu_compute_rk4_weighted_sum(double *out, const double *in, const double *k1,
        const double *k2, const double *k3, const double *k4, const double *reaction,
        const double *material, size_t num_dofs, double dt)
    {
        static const size_t RK4_THREADS = 256;
    
        auto gs = num_dofs/RK4_THREADS;
        if (num_dofs % RK4_THREADS != 0)
            gs += 1;
    
        gpu_rk4_weighted_sum_kernel<<<gs, RK4_THREADS>>>(out, in, k1, k2, k3, k4,
            reaction, material, num_dofs, dt);
    }
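
    /* Hypothetical usage sketch (not compiled here, and not an API defined in
     * this file): one way the host wrappers above could be chained for an
     * explicit Euler step. The buffer names, the `entities` container and the
     * right-hand-side assembly step are assumptions.
     *
     *   for (auto& edg : entities)   // one entity_data_gpu per mesh entity
     *   {
     *       gpu_compute_field_derivatives(edg, u, du_dx, du_dy, du_dz, 1.0);
     *       gpu_compute_flux_lifting(edg, fluxes, lifted);
     *   }
     *   // ...assemble k from du_dx/du_dy/du_dz elsewhere, then:
     *   gpu_subtract(k, k, lifted, num_dofs);   // k -= lifted fluxes
     *   gpu_compute_euler_update(u_next, u, k, material, num_dofs, dt);
     */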