Skip to content
Snippets Groups Projects
Commit 551bb093 authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

Added Kokkos test snippets.

parents
No related branches found
No related tags found
No related merge requests found
project(kokkos-testing)
cmake_minimum_required(VERSION 3.10)
add_subdirectory(kokkos)
add_executable(finite_difference_kokkos finite_difference_kokkos.cpp)
target_link_libraries(finite_difference_kokkos Kokkos::kokkos siloh5)
add_executable(finite_difference_baseline finite_difference_baseline.cpp)
target_link_libraries(finite_difference_baseline siloh5)
Testing code for Kokkos/OpenACC/CUDA
You have to
git clone https://github.com/kokkos/kokkos.git
before compiling anything
#include <iostream>
#include <fstream>
#include <sstream>
#include <sys/time.h>
#include <vector>
#include <cmath>
#include <chrono>
#include <silo.h>
template<typename T>
class fd_grid
{
std::vector<T> m_data;
size_t m_rows, m_cols;
public:
fd_grid()
{}
fd_grid(size_t rows, size_t cols) :
m_data(rows*cols), m_rows(rows), m_cols(cols)
{}
fd_grid(fd_grid&& other)
{
std::swap(m_data, other.m_data);
}
fd_grid& operator=(fd_grid&& other)
{
std::swap(m_data, other.m_data);
return *this;
}
T operator()(size_t i, size_t j) const
{
return m_data[i*m_cols + j];
}
T& operator()(size_t i, size_t j)
{
return m_data[i*m_cols + j];
}
size_t rows() const { return m_rows; }
size_t cols() const { return m_cols; }
T* data() { return m_data.data(); }
const T* data() const { return m_data.data(); }
};
int
visit_dump(const fd_grid<double>& g, const std::string& fn)
{
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = g.cols();
auto size_y = g.rows();
std::vector<double> x(size_x);
std::vector<double> y(size_y);
for (size_t i = 0; i < size_x; i++)
x.at(i) = double(i)/(size_x-1);
for (size_t i = 0; i < size_y; i++)
y.at(i) = double(i)/(size_y-1);
int dims[] = { int(size_y), int(size_y) };
int ndims = 2;
double *coords[] = {x.data(), y.data()};
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
DBClose(db);
return 0;
}
//#define HIGH_ORDER
#ifdef HIGH_ORDER
#define HALO_SIZE 4
#else
#define HALO_SIZE 1
#endif
int main( int argc, char* argv[] )
{
using namespace std::chrono;
int Nx = 1024;
int Ny = 1024;
int halo = HALO_SIZE;
int rows = Ny + 2*halo;
int cols = Nx + 2*halo;
int timesteps = 500;
double dx = 1./(Nx-1);
double dy = 1./(Ny-1);
double dt = 0.0001;
double velocity = 1;
double damping = 0.1;
auto c2 = velocity*velocity;
auto dx2 = dx*dx;
auto dy2 = dy*dy;
auto dt2 = dt*dt;
auto D1 = 1./(1. + 0.5*damping*dt);
auto D2 = 0.5*damping*dt - 1;
typedef fd_grid<double> matrix_view_type;
matrix_view_type u_prev(rows, cols);
matrix_view_type u_curr(rows, cols);
matrix_view_type u_next(rows, cols);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
if ( i < halo or i > rows-halo or j < halo or j > cols-halo)
{
u_prev(i,j) = 0.0;
u_curr(i,j) = 0.0;
}
else
{
double x = dx*i - 0.5;
double y = dy*j - 0.1;
u_prev(i,j) = -std::exp(-2400*(x*x + y*y));
u_curr(i,j) = 2*dt*u_prev(i,j);
}
}
}
double iter_time = 0.0;
double w0 = -205.0/72.0;
double w1 = 8.0/5.0;
double w2 = -1.0/5.0;
double w3 = 8.0/315.0;
double w4 = -1.0/560.0;
double w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
for (int ts = 0; ts < timesteps; ts++)
{
auto t_begin = high_resolution_clock::now();
for (size_t i = halo; i < rows - halo; i++)
{
for (size_t j = halo; j < cols - halo; j++)
{
/* 2nd order in space */
#ifndef HIGH_ORDER
u_next(i,j) = D1*(2.0*u_curr(i,j) + D2*u_prev(i,j) +
+ (c2*dt2/dy2)*(u_curr(i+1,j) - 2.0*u_curr(i,j) + u_curr(i-1,j))
+ (c2*dt2/dx2)*(u_curr(i,j+1) - 2.0*u_curr(i,j) + u_curr(i,j-1)));
if (j == 1)
u_next(i,j-1) = 0;//u_next(i,j);
if (j == cols-2)
u_next(i,j+1) = 0;//u_next(i,j);
if (i == 1)
u_next(i-1,j) = 0;//u_next(i,j);
if (i == rows-2)
u_next(i+1,j) = 0;//u_next(i,j);
#else
/* 8th order in space */
double deriv_x = 0.0;
double deriv_y = 0.0;
for (int k = -4; k <= 4; k++)
{
deriv_x += w[k+4]*u_curr(i+k,j);
deriv_y += w[k+4]*u_curr(i,j+k);
}
u_next(i,j) = D1*(2.0*u_curr(i,j) + D2*u_prev(i,j) +
+ (c2*dt2/dy2)*(deriv_y)
+ (c2*dt2/dx2)*(deriv_x));
if (j == halo)
for (size_t k = 0; k < halo; k++)
u_next(i,k) = 0;
if (j == cols-(halo+1))
for (size_t k = 0; k < halo; k++)
u_next(i,j+k+1) = 0;
if (i == 1)
for (size_t k = 0; k < halo; k++)
u_next(k,j) = 0;
if (i == rows-(halo+1))
for (size_t k = 0; k < halo; k++)
u_next(i+k+1,j) = 0;
#endif
}
}
std::swap(u_prev, u_curr);
std::swap(u_curr, u_next);
auto t_end = high_resolution_clock::now();
duration<double> time_span = duration_cast<duration<double>>(t_end - t_begin);
iter_time += time_span.count();
/*
if ( (ts % 100) == 0 )
{
std::stringstream ss;
ss << "solution_" << ts << ".silo";
visit_dump(u_curr, ss.str());
}
*/
}
std::cout << "Average iteration time: " << iter_time/timesteps << std::endl;
return 0;
}
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <cmath>
#include <chrono>
#include <type_traits>
#include <silo.h>
#include <Kokkos_Core.hpp>
template<typename T>
int
visit_dump(const Kokkos::View<T**>& kv, const std::string& fn)
{
static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value,
"Only double or float");
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = kv.extent(0);
auto size_y = kv.extent(1);
std::vector<T> x(size_x);
std::vector<T> y(size_y);
for (size_t i = 0; i < size_x; i++)
x.at(i) = T(i)/(size_x-1);
for (size_t i = 0; i < size_y; i++)
y.at(i) = T(i)/(size_y-1);
int dims[] = { int(size_y), int(size_y) };
int ndims = 2;
T *coords[] = { x.data(), y.data() };
if (std::is_same<T,float>::value)
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims,
DB_FLOAT, DB_COLLINEAR, NULL);
if (std::is_same<T,double>::value)
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims,
DB_DOUBLE, DB_COLLINEAR, NULL);
std::vector<T> data(x.size() * y.size());
for (size_t i = 0; i < x.size(); i++)
for (size_t j = 0; j < y.size(); j++)
data.at(i*y.size()+j) = kv(i,j);
if (std::is_same<T,float>::value)
DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims,
NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
if (std::is_same<T,double>::value)
DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims,
NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
DBClose(db);
return 0;
}
#define HIGH_ORDER
#ifdef HIGH_ORDER
#define HALO_SIZE 4
#else
#define HALO_SIZE 1
#endif
int main(int argc, char *argv[])
{
using T = double;
using namespace Kokkos;
using namespace std::chrono;
int Nx = 1024;
int Ny = 1024;
int halo = HALO_SIZE;
int rows = Ny + 2*halo;
int cols = Nx + 2*halo;
int timesteps = 5000;
T dx = 1./(Nx-1);
T dy = 1./(Ny-1);
T dt = 0.0001;
T velocity = 1;
T damping = 0.1;
auto c2 = velocity*velocity;
auto dx2 = dx*dx;
auto dy2 = dy*dy;
auto dt2 = dt*dt;
auto D1 = 1./(1. + 0.5*damping*dt);
auto D2 = 0.5*damping*dt - 1;
Kokkos::initialize( argc, argv );
{
typedef View<T**> matrix_view_type;
matrix_view_type u_prev("u_next", rows, cols);
matrix_view_type u_curr("u_curr", rows, cols);
matrix_view_type u_next("u_next", rows, cols);
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
if ( i < halo or i > rows-halo or j < halo or j > cols-halo)
{
u_prev(i,j) = 0.0;
u_curr(i,j) = 0.0;
}
else
{
T x = dx*i - 0.5;
T y = dy*j - 0.1;
u_prev(i,j) = -std::exp(-2400*(x*x + y*y));
u_curr(i,j) = 2*dt*u_prev(i,j);
}
}
}
MDRangePolicy<Rank<2>> range({halo,halo}, {rows-(halo+1), cols-(halo+1)});
#ifdef HIGH_ORDER
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
#endif
double iter_time = 0.0;
for (int ts = 0; ts < timesteps; ts++)
{
auto t_begin = high_resolution_clock::now();
parallel_for(range, KOKKOS_LAMBDA(int i, int j)
{
#ifndef HIGH_ORDER
u_next(i,j) = D1 * (
2.0*u_curr(i,j)
+ D2*u_prev(i,j) +
+ (c2*dt2/dy2) *
(u_curr(i+1,j) - 2.0*u_curr(i,j) + u_curr(i-1,j))
+ (c2*dt2/dx2) *
(u_curr(i,j+1) - 2.0*u_curr(i,j) + u_curr(i,j-1))
);
if (j == 1)
u_next(i,j-1) = 0
if (j == cols-2)
u_next(i,j+1) = 0;
if (i == 1)
u_next(i-1,j) = 0;
if (i == rows-2)
u_next(i+1,j) = 0;
#else
/* 8th order in space */
T deriv_x = 0.0;
T deriv_y = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
{
deriv_x += w[k+HALO_SIZE]*u_curr(i+k,j);
deriv_y += w[k+HALO_SIZE]*u_curr(i,j+k);
}
u_next(i,j) = D1*(2.0*u_curr(i,j) + D2*u_prev(i,j) +
+ (c2*dt2/dy2)*(deriv_y)
+ (c2*dt2/dx2)*(deriv_x));
if (j == halo)
for (size_t k = 0; k < halo; k++)
u_next(i,k) = 0;
if (j == cols-(halo+1))
for (size_t k = 0; k < halo; k++)
u_next(i,j+k+1) = 0;
if (i == 1)
for (size_t k = 0; k < halo; k++)
u_next(k,j) = 0;
if (i == rows-(halo+1))
for (size_t k = 0; k < halo; k++)
u_next(i+k+1,j) = 0;
#endif
});
std::swap(u_prev, u_curr);
std::swap(u_curr, u_next);
auto t_end = high_resolution_clock::now();
duration<double> time_span =
duration_cast<duration<double>>(t_end - t_begin);
iter_time += time_span.count();
if ( (ts % 100) == 0 )
{
std::stringstream ss;
ss << "solution_" << ts << ".silo";
visit_dump(u_curr, ss.str());
}
}
double avg_iter_time = iter_time/timesteps;
std::cout << "Average iteration time: " << avg_iter_time << std::endl;
}
Kokkos::finalize();
return 0;
}
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdio>
#include <cassert>
#include <silo.h>
#define DEBUG
#define SINGLE_PRECISION
template<typename T>
class fd_grid
{
std::vector<T> m_data;
size_t m_rows, m_cols;
public:
fd_grid()
{}
fd_grid(size_t rows, size_t cols) :
m_data(rows*cols), m_rows(rows), m_cols(cols)
{}
fd_grid(fd_grid&& other)
{
std::swap(m_data, other.m_data);
}
fd_grid& operator=(fd_grid&& other)
{
std::swap(m_data, other.m_data);
return *this;
}
T operator()(size_t i, size_t j) const
{
return m_data[i*m_cols + j];
}
T& operator()(size_t i, size_t j)
{
return m_data[i*m_cols + j];
}
size_t rows() const { return m_rows; }
size_t cols() const { return m_cols; }
T* data() { return m_data.data(); }
const T* data() const { return m_data.data(); }
};
template<typename T>
int
visit_dump(const fd_grid<T>& g, const std::string& fn)
{
static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = g.cols();
auto size_y = g.rows();
std::vector<T> x(size_x);
std::vector<T> y(size_y);
for (size_t i = 0; i < size_x; i++)
x.at(i) = T(i)/(size_x-1);
for (size_t i = 0; i < size_y; i++)
y.at(i) = T(i)/(size_y-1);
int dims[] = { int(size_y), int(size_y) };
int ndims = 2;
T *coords[] = {x.data(), y.data()};
if (std::is_same<T,float>::value)
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
}
else
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
}
DBClose(db);
return 0;
}
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
#ifdef SINGLE_PRECISION
__constant__ float d_w[9];
#else
__constant__ double d_w[9];
#endif
template<typename T>
__global__ void test_kernel(T* data)
{
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
data[ofs] = gridDim.x * blockIdx.y + blockIdx.x;
}
#define GRID_ROWS 32
#define GRID_COLS 32
#define HALO_SIZE 4
int grid_rows;
int grid_cols;
template<typename T>
__global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[GRID_COLS+2*HALO_SIZE];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs = threadIdx.x + HALO_SIZE;
s_curr[shm_ofs] = u_curr[ofs];
__syncthreads();
if (threadIdx.x < 4)
{
s_curr[threadIdx.x] = 0.0;
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#if 0
template<typename T>
__global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[2*HALO_SIZE][32];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs_x = threadIdx.x;
int shm_ofs_y = threadIdx.y;
s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs];
__syncthreads();
if (threadIdx.y < 4)
{
s_curr[threadIdx.x] = 0.0;
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#endif
int main(void)
{
#ifdef SINGLE_PRECISION
using T = float;
#else
using T = double;
#endif
/**** Get device properties ****/
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, 0) );
printf("Device Name: %s\n", prop.name);
printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);
/**** Initialize constants ****/
T w0 = -205.0/72.0;
T w1 = 8.0/5.0;
T w2 = -1.0/5.0;
T w3 = 8.0/315.0;
T w4 = -1.0/560.0;
T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
/**** Prepare for timing ****/
float milliseconds;
cudaEvent_t startEvent, stopEvent;
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
/**** Prepare grid and copy to device ****/
fd_grid<T> g(GRID_ROWS, GRID_COLS);
for (size_t i = 0; i < GRID_ROWS; i++)
for (size_t j = 0; j < GRID_COLS; j++)
g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS);
//g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS);
T* u_curr;
T* u_next;
checkCuda( cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) );
dim3 gs(1,GRID_ROWS);
dim3 tpb(GRID_COLS,1);
checkCuda( cudaEventRecord(startEvent, 0) );
//for (size_t i = 0; i < 1; i++)
x_kernel<<<gs, tpb>>>(u_curr, u_next);
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl;
checkCuda( cudaDeviceSynchronize() );
checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) );
checkCuda( cudaFree(u_next) );
checkCuda( cudaFree(u_curr) );
visit_dump(g, "cuda.silo");
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment