finite_differences.cu

#include <iostream>
#include <fstream>
#include <vector>
#include <cstdio>
#include <cassert>
#include <silo.h>

#define DEBUG

#define SINGLE_PRECISION

template<typename T>
class fd_grid
{
    std::vector<T>  m_data;
    size_t m_rows, m_cols;

public:
    fd_grid()
    {}

    fd_grid(size_t rows, size_t cols) :
        m_data(rows*cols), m_rows(rows), m_cols(cols)
    {}

    fd_grid(fd_grid&& other)
    {
        std::swap(m_data, other.m_data);
    }

    fd_grid& operator=(fd_grid&& other)
    {
        std::swap(m_data, other.m_data);
        return *this;
    }

    T operator()(size_t i, size_t j) const
    {
        return m_data[i*m_cols + j];
    }

    T& operator()(size_t i, size_t j)
    {
        return m_data[i*m_cols + j];
    }

    size_t rows() const { return m_rows; }
    size_t cols() const { return m_cols; }

    T* data() { return m_data.data(); }
    const T* data() const { return m_data.data(); }
};

template<typename T>
int
visit_dump(const fd_grid<T>& g, const std::string& fn)
{
    static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");

    DBfile *db = nullptr;
    db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
    if (!db)
    {
        std::cout << "Cannot write simulation output" << std::endl;
        return -1;
    }

    auto size_x = g.cols();
    auto size_y = g.rows();

    std::vector<T> x(size_x);
    std::vector<T> y(size_y);

    for (size_t i = 0; i < size_x; i++)
        x.at(i) = T(i)/(size_x-1);

    for (size_t i = 0; i < size_y; i++)
        y.at(i) = T(i)/(size_y-1);

    int dims[] = { int(size_y), int(size_y) };
    int ndims = 2;

    T *coords[] = {x.data(), y.data()};

    if (std::is_same<T,float>::value)
    {
        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);
        DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
    }
    else
    {
        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
        DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
    }

    DBClose(db);
    return 0;
}


inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
#endif
  return result;
}

#ifdef SINGLE_PRECISION
__constant__ float d_w[9];
#else
__constant__ double d_w[9];
#endif

template<typename T>
__global__ void test_kernel(T* data)
{
    int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
    int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;

    int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;

    data[ofs] = gridDim.x * blockIdx.y + blockIdx.x;
}


#define     GRID_ROWS       32
#define     GRID_COLS       32
#define     HALO_SIZE       4

int grid_rows;
int grid_cols;

template<typename T>
__global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
    __shared__ T s_curr[GRID_COLS+2*HALO_SIZE];

    /* Get X/Y coordinates of this grid point */
    int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
    int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;

    /* Get offset of this grid point in the fd grid array */
    int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;

    int shm_ofs = threadIdx.x + HALO_SIZE;

    s_curr[shm_ofs] = u_curr[ofs];

    __syncthreads();


    if (threadIdx.x < 4)
    {
        s_curr[threadIdx.x] = 0.0;
        s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
    }

    __syncthreads();

    T dx = 0.0;
    for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
        dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];

    u_next[ofs] = dx;
}

#if 0
template<typename T>
__global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
    __shared__ T s_curr[2*HALO_SIZE][32];

    /* Get X/Y coordinates of this grid point */
    int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
    int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;

    /* Get offset of this grid point in the fd grid array */
    int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;

    int shm_ofs_x = threadIdx.x;
    int shm_ofs_y = threadIdx.y;

    s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs];

    __syncthreads();


    if (threadIdx.y < 4)
    {
        s_curr[threadIdx.x] = 0.0;
        s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
    }

    __syncthreads();

    T dx = 0.0;
    for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
        dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];

    u_next[ofs] = dx;
}
#endif
int main(void)
{

#ifdef SINGLE_PRECISION
    using T = float;
#else
    using T = double;
#endif

    /**** Get device properties ****/
    cudaDeviceProp prop;
    checkCuda( cudaGetDeviceProperties(&prop, 0) );
    printf("Device Name: %s\n", prop.name);
    printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);

    /**** Initialize constants ****/
    T w0 = -205.0/72.0;
    T w1 = 8.0/5.0;
    T w2 = -1.0/5.0;
    T w3 = 8.0/315.0;
    T w4 = -1.0/560.0;
    T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
    checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );

    /**** Prepare for timing ****/
    float milliseconds;
    cudaEvent_t startEvent, stopEvent;
    checkCuda( cudaEventCreate(&startEvent) );
    checkCuda( cudaEventCreate(&stopEvent) );

    /**** Prepare grid and copy to device ****/
    fd_grid<T> g(GRID_ROWS, GRID_COLS);
    for (size_t i = 0; i < GRID_ROWS; i++)
        for (size_t j = 0; j < GRID_COLS; j++)
            g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS);
            //g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS);

    T* u_curr;
    T* u_next;

    checkCuda( cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) );
    checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) );
    checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) );

    dim3 gs(1,GRID_ROWS);
    dim3 tpb(GRID_COLS,1);

    checkCuda( cudaEventRecord(startEvent, 0) );
    //for (size_t i = 0; i < 1; i++)
        x_kernel<<<gs, tpb>>>(u_curr, u_next);
    checkCuda( cudaEventRecord(stopEvent, 0) );
    checkCuda( cudaEventSynchronize(stopEvent) );
    checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );

    std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl;

    checkCuda( cudaDeviceSynchronize() );

    checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) );

    checkCuda( cudaFree(u_next) );
    checkCuda( cudaFree(u_curr) );

    visit_dump(g, "cuda.silo");

    return 0;
}