Select Git revision
finite_differences.cu
finite_differences.cu 6.69 KiB
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdio>
#include <cassert>
#include <silo.h>
#define DEBUG
#define SINGLE_PRECISION
template<typename T>
class fd_grid
{
std::vector<T> m_data;
size_t m_rows, m_cols;
public:
fd_grid()
{}
fd_grid(size_t rows, size_t cols) :
m_data(rows*cols), m_rows(rows), m_cols(cols)
{}
fd_grid(fd_grid&& other)
{
std::swap(m_data, other.m_data);
}
fd_grid& operator=(fd_grid&& other)
{
std::swap(m_data, other.m_data);
return *this;
}
T operator()(size_t i, size_t j) const
{
return m_data[i*m_cols + j];
}
T& operator()(size_t i, size_t j)
{
return m_data[i*m_cols + j];
}
size_t rows() const { return m_rows; }
size_t cols() const { return m_cols; }
T* data() { return m_data.data(); }
const T* data() const { return m_data.data(); }
};
template<typename T>
int
visit_dump(const fd_grid<T>& g, const std::string& fn)
{
static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = g.cols();
auto size_y = g.rows();
std::vector<T> x(size_x);
std::vector<T> y(size_y);
for (size_t i = 0; i < size_x; i++)
x.at(i) = T(i)/(size_x-1);
for (size_t i = 0; i < size_y; i++)
y.at(i) = T(i)/(size_y-1);
int dims[] = { int(size_y), int(size_y) };
int ndims = 2;
T *coords[] = {x.data(), y.data()};
if (std::is_same<T,float>::value)
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
}
else
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
}
DBClose(db);
return 0;
}
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
#ifdef SINGLE_PRECISION
__constant__ float d_w[9];
#else
__constant__ double d_w[9];
#endif
template<typename T>
__global__ void test_kernel(T* data)
{
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
data[ofs] = gridDim.x * blockIdx.y + blockIdx.x;
}
#define GRID_ROWS 32
#define GRID_COLS 32
#define HALO_SIZE 4
int grid_rows;
int grid_cols;
template<typename T>
__global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[GRID_COLS+2*HALO_SIZE];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs = threadIdx.x + HALO_SIZE;
s_curr[shm_ofs] = u_curr[ofs];
__syncthreads();
if (threadIdx.x < 4)
{
s_curr[threadIdx.x] = 0.0;
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#if 0
template<typename T>
__global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[2*HALO_SIZE][32];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs_x = threadIdx.x;
int shm_ofs_y = threadIdx.y;
s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs];
__syncthreads();
if (threadIdx.y < 4)
{
s_curr[threadIdx.x] = 0.0;
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#endif
int main(void)
{
#ifdef SINGLE_PRECISION
using T = float;
#else
using T = double;
#endif
/**** Get device properties ****/
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, 0) );
printf("Device Name: %s\n", prop.name);
printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);
/**** Initialize constants ****/
T w0 = -205.0/72.0;
T w1 = 8.0/5.0;
T w2 = -1.0/5.0;
T w3 = 8.0/315.0;
T w4 = -1.0/560.0;
T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
/**** Prepare for timing ****/
float milliseconds;
cudaEvent_t startEvent, stopEvent;
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
/**** Prepare grid and copy to device ****/
fd_grid<T> g(GRID_ROWS, GRID_COLS);
for (size_t i = 0; i < GRID_ROWS; i++)
for (size_t j = 0; j < GRID_COLS; j++)
g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS);
//g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS);
T* u_curr;
T* u_next;
checkCuda( cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) );
dim3 gs(1,GRID_ROWS);
dim3 tpb(GRID_COLS,1);
checkCuda( cudaEventRecord(startEvent, 0) );
//for (size_t i = 0; i < 1; i++)
x_kernel<<<gs, tpb>>>(u_curr, u_next);
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl;
checkCuda( cudaDeviceSynchronize() );
checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) );
checkCuda( cudaFree(u_next) );
checkCuda( cudaFree(u_curr) );
visit_dump(g, "cuda.silo");
return 0;
}