Skip to content
Snippets Groups Projects
Select Git revision
  • 808ea42a60d954e64229208679a0ca4bea0afc6d
  • master default
  • cgnsUnstructured
  • partitioning
  • poppler
  • HighOrderBLCurving
  • gmsh_3_0_4
  • gmsh_3_0_3
  • gmsh_3_0_2
  • gmsh_3_0_1
  • gmsh_3_0_0
  • gmsh_2_16_0
  • gmsh_2_15_0
  • gmsh_2_14_1
  • gmsh_2_14_0
  • gmsh_2_13_2
  • gmsh_2_13_1
  • gmsh_2_12_0
  • gmsh_2_11_0
  • gmsh_2_10_1
  • gmsh_2_10_0
  • gmsh_2_9_3
  • gmsh_2_9_2
  • gmsh_2_9_1
  • gmsh_2_9_0
  • gmsh_2_8_6
26 results

Colors.h

Blame
  • Forked from gmsh / gmsh
    Source project has a limited visibility.
    finite_differences.cu 6.69 KiB
    #include <iostream>
    #include <fstream>
    #include <vector>
    #include <cstdio>
    #include <cassert>
    #include <silo.h>
    
    #define DEBUG
    
    #define SINGLE_PRECISION
    
    template<typename T>
    class fd_grid
    {
        std::vector<T>  m_data;
        size_t m_rows, m_cols;
    
    public:
        fd_grid()
        {}
    
        fd_grid(size_t rows, size_t cols) :
            m_data(rows*cols), m_rows(rows), m_cols(cols)
        {}
    
        fd_grid(fd_grid&& other)
        {
            std::swap(m_data, other.m_data);
        }
    
        fd_grid& operator=(fd_grid&& other)
        {
            std::swap(m_data, other.m_data);
            return *this;
        }
    
        T operator()(size_t i, size_t j) const
        {
            return m_data[i*m_cols + j];
        }
    
        T& operator()(size_t i, size_t j)
        {
            return m_data[i*m_cols + j];
        }
    
        size_t rows() const { return m_rows; }
        size_t cols() const { return m_cols; }
    
        T* data() { return m_data.data(); }
        const T* data() const { return m_data.data(); }
    };
    
    template<typename T>
    int
    visit_dump(const fd_grid<T>& g, const std::string& fn)
    {
        static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
    
        DBfile *db = nullptr;
        db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
        if (!db)
        {
            std::cout << "Cannot write simulation output" << std::endl;
            return -1;
        }
    
        auto size_x = g.cols();
        auto size_y = g.rows();
        
        std::vector<T> x(size_x);
        std::vector<T> y(size_y);
        
        for (size_t i = 0; i < size_x; i++)
            x.at(i) = T(i)/(size_x-1);
            
        for (size_t i = 0; i < size_y; i++)
            y.at(i) = T(i)/(size_y-1);
            
        int dims[] = { int(size_y), int(size_y) };
        int ndims = 2;
        
        T *coords[] = {x.data(), y.data()};
        
        if (std::is_same<T,float>::value)
        {
            DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);    
            DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
        }
        else
        {
            DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);    
            DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
        }
    
        DBClose(db);
        return 0;
    }
    
    
    
    inline cudaError_t checkCuda(cudaError_t result)
    {
    #if defined(DEBUG) || defined(_DEBUG)
      if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
      }
    #endif
      return result;
    }
    
    #ifdef SINGLE_PRECISION
    __constant__ float d_w[9];
    #else
    __constant__ double d_w[9];
    #endif
    
    template<typename T>
    __global__ void test_kernel(T* data)
    {
        int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
        int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
    
        int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
    
        data[ofs] = gridDim.x * blockIdx.y + blockIdx.x;
    }
    
    
    #define     GRID_ROWS       32
    #define     GRID_COLS       32
    #define     HALO_SIZE       4
    
    int grid_rows;
    int grid_cols;
    
    template<typename T>
    __global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next)
    {
        __shared__ T s_curr[GRID_COLS+2*HALO_SIZE];
    
        /* Get X/Y coordinates of this grid point */
        int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
        int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
    
        /* Get offset of this grid point in the fd grid array */
        int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
    
        int shm_ofs = threadIdx.x + HALO_SIZE;
    
        s_curr[shm_ofs] = u_curr[ofs];
    
        __syncthreads();
        
        
        if (threadIdx.x < 4)
        {
            s_curr[threadIdx.x] = 0.0;
            s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
        }
    
        __syncthreads();
        
        T dx = 0.0;
        for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
            dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
    
        u_next[ofs] = dx;
    }
    
    #if 0
    template<typename T>
    __global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next)
    {
        __shared__ T s_curr[2*HALO_SIZE][32];
    
        /* Get X/Y coordinates of this grid point */
        int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
        int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
    
        /* Get offset of this grid point in the fd grid array */
        int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
    
        int shm_ofs_x = threadIdx.x;
        int shm_ofs_y = threadIdx.y;
    
        s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs];
    
        __syncthreads();
        
        
        if (threadIdx.y < 4)
        {
            s_curr[threadIdx.x] = 0.0;
            s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
        }
    
        __syncthreads();
        
        T dx = 0.0;
        for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
            dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
    
        u_next[ofs] = dx;
    }
    #endif
    int main(void)
    {
    
    #ifdef SINGLE_PRECISION
        using T = float;
    #else
        using T = double;
    #endif
    
        /**** Get device properties ****/
        cudaDeviceProp prop;
        checkCuda( cudaGetDeviceProperties(&prop, 0) );
        printf("Device Name: %s\n", prop.name);
        printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);
    
        /**** Initialize constants ****/
        T w0 = -205.0/72.0;
        T w1 = 8.0/5.0;
        T w2 = -1.0/5.0;
        T w3 = 8.0/315.0;
        T w4 = -1.0/560.0;
        T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
        checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
    
        /**** Prepare for timing ****/
        float milliseconds;
        cudaEvent_t startEvent, stopEvent;
        checkCuda( cudaEventCreate(&startEvent) );
        checkCuda( cudaEventCreate(&stopEvent) );
    
        /**** Prepare grid and copy to device ****/
        fd_grid<T> g(GRID_ROWS, GRID_COLS);
        for (size_t i = 0; i < GRID_ROWS; i++)
            for (size_t j = 0; j < GRID_COLS; j++)
                g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS);
                //g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS);
    
        T* u_curr;
        T* u_next;
    
        checkCuda( cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) );
        checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) );
        checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) );
    
        dim3 gs(1,GRID_ROWS);
        dim3 tpb(GRID_COLS,1);
    
        checkCuda( cudaEventRecord(startEvent, 0) );
        //for (size_t i = 0; i < 1; i++)
            x_kernel<<<gs, tpb>>>(u_curr, u_next);
        checkCuda( cudaEventRecord(stopEvent, 0) );
        checkCuda( cudaEventSynchronize(stopEvent) );
        checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
    
        std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl;
    
        checkCuda( cudaDeviceSynchronize() );
    
        checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) );
        
        checkCuda( cudaFree(u_next) );
        checkCuda( cudaFree(u_curr) );
    
        visit_dump(g, "cuda.silo");
    
        return 0;
    }