Commit f0386bda authored by Matteo Cicuttin

Added modular finite difference test code 'fd_catalog'.

parent cd0cbd2e
project(kokkos-testing)
cmake_minimum_required(VERSION 3.10)
find_package(CUDA)
add_subdirectory(kokkos)
add_executable(finite_difference_kokkos finite_difference_kokkos.cpp)
......
set term postscript enhanced color
set output 'timing.eps'
-set title '2D damped wave equation, 1024x1024 grid'
+set title '2D damped wave equation'
set style data histogram
set style fill solid border -1
set logscale y
set grid ytics mytics
-plot 'timings2.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col
+plot 'timings-float.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col
project(fd_catalog)
cmake_minimum_required(VERSION 3.10)
find_package(CUDA)
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <type_traits>

#include "fd_grid.hpp"

#ifdef HAVE_SILO
#include <silo.h>
#endif /* HAVE_SILO */
#ifdef HAVE_SILO
template<typename T>
int
visit_dump(const fd_grid<T>& g, const std::string& fn)
{
static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = g.grid_cols();
auto size_y = g.grid_rows();
std::vector<T> x(size_x);
std::vector<T> y(size_y);
auto dx = g.dx();
auto px = -dx * g.halo();
for (size_t i = 0; i < size_x; i++)
x.at(i) = px + i*dx;
auto dy = g.dy();
auto py = -dy * g.halo();
for (size_t i = 0; i < size_y; i++)
y.at(i) = py + i*dy;
int dims[] = { int(size_x), int(size_y) };
int ndims = 2;
T *coords[] = {x.data(), y.data()};
if (std::is_same<T,float>::value)
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
}
else
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
}
DBClose(db);
return 0;
}
#endif /* HAVE_SILO */
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <vector>
#include <cstdint>
#include <cstddef>
#include <cassert>
#include <algorithm>
template<typename T, typename UInt = uint64_t, typename Int = int64_t>
class fd_grid
{
std::vector<T> m_data;
UInt m_grid_rows, m_grid_cols;
UInt m_domain_rows, m_domain_cols, m_halo;
public:
fd_grid() :
m_grid_rows(0), m_grid_cols(0),
m_domain_rows(0), m_domain_cols(0), m_halo(0)
{}
fd_grid(UInt rows, UInt cols, UInt halo) :
m_grid_rows(rows+2*halo),
m_grid_cols(cols+2*halo),
m_domain_rows(rows),
m_domain_cols(cols),
m_halo(halo),
m_data( (rows+2*halo) * (cols+2*halo) )
{}
fd_grid(fd_grid&& other)
{
m_grid_rows = other.m_grid_rows;
m_grid_cols = other.m_grid_cols;
m_domain_rows = other.m_domain_rows;
m_domain_cols = other.m_domain_cols;
m_halo = other.m_halo;
m_data = std::move(other.m_data);
}
fd_grid& operator=(const fd_grid& other)
{
m_grid_rows = other.m_grid_rows;
m_grid_cols = other.m_grid_cols;
m_domain_rows = other.m_domain_rows;
m_domain_cols = other.m_domain_cols;
m_halo = other.m_halo;
m_data.resize( other.m_data.size() );
std::copy(other.m_data.begin(), other.m_data.end(), m_data.begin());
return *this;
}
fd_grid& operator=(fd_grid&& other)
{
m_grid_rows = other.m_grid_rows;
m_grid_cols = other.m_grid_cols;
m_domain_rows = other.m_domain_rows;
m_domain_cols = other.m_domain_cols;
m_halo = other.m_halo;
m_data = std::move(other.m_data);
return *this;
}
fd_grid& operator=(const T& val)
{
for (auto& d : m_data)
d = val;
return *this;
}
/* Negative indices are allowed to access halo */
T& operator()(Int i, Int j)
{
auto ofs_row = i + m_halo;
assert(ofs_row < m_domain_rows + 2*m_halo);
auto ofs_col = j + m_halo;
assert(ofs_col < m_domain_cols + 2*m_halo);
auto grid_cols = m_domain_cols+2*m_halo;
auto ofs = ofs_row * grid_cols + ofs_col;
assert( (ofs >= 0) and (ofs < m_data.size()) );
return m_data[ofs];
}
/* Negative indices are allowed to access halo */
T operator()(Int i, Int j) const
{
auto ofs_row = i + m_halo;
auto ofs_col = j + m_halo;
auto grid_cols = m_domain_cols+2*m_halo;
auto ofs = ofs_row * grid_cols + ofs_col;
assert( (ofs >= 0) and (ofs < m_data.size()) );
return m_data[ofs];
}
size_t domain_rows() const { return m_domain_rows; }
size_t domain_cols() const { return m_domain_cols; }
size_t halo() const { return m_halo; }
size_t grid_rows() const { return m_grid_rows; }
size_t grid_cols() const { return m_grid_cols; }
T* data() { return m_data.data(); }
const T* data() const { return m_data.data(); }
size_t size() const { return m_data.size(); }
T dx() const { return 1./(m_domain_cols-1); }
T dy() const { return 1./(m_domain_rows-1); }
};
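A minimal usage sketch of fd_grid (not part of the commit; the 4x4 interior, 2-cell halo and the printed values are only illustrative): negative indices reach into the halo, while dx()/dy() are derived from the interior size only.

#include <iostream>
#include "fd_grid.hpp"

int main()
{
    fd_grid<double> g(4, 4, 2);   // interior rows, interior cols, halo width
    g = 0.0;                      // fill the whole grid, halo included
    g(0, 0)  = 1.0;               // first interior point
    g(-1, 0) = 0.5;               // halo row just above the interior
    std::cout << g.grid_rows() << "x" << g.grid_cols()   // 8x8 with the halo
              << ", dx = " << g.dx() << std::endl;       // 1/(4-1) = 1/3
    return 0;
}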
#include <iostream>
#include <fstream>
#include <cstdio>
#include <thread>

#include "fd_wave_cpu.hpp"

#ifdef HAVE_CUDA
#include "fd_wave_cuda.hpp"
#endif
int main(void)
{
#ifdef SINGLE_PRECISION
using T = float;
#else
using T = double;
#endif
std::ofstream ofs("timings.txt");
/* Make header */
ofs << "\"SIZE\" \"Seq\" \"SeqBlk\" ";
#ifdef HAVE_CUDA
ofs << "Cuda ";
#endif
auto maxthreads = std::thread::hardware_concurrency();
for (size_t threads = 1; threads <= maxthreads; threads *= 2)
ofs << "\"" << threads << " threads\" ";
ofs << std::endl;
double time;
for (size_t sz = 128; sz <= 1024; sz *= 2)
{
wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 500);
ofs << sz << " ";
wec.init();
time = solve_sequential(wec);
ofs << time << " ";
wec.init();
time = solve_sequential_blocked(wec);
ofs << time << " ";
#ifdef HAVE_CUDA
wec.init();
time = solve_cuda(wec);
ofs << time << " ";
#endif
for (size_t threads = 1; threads <= maxthreads; threads *= 2)
{
wec.init();
time = solve_multithread(wec, threads);
ofs << time << " ";
}
ofs << std::endl;
}
}
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <cmath>
#include <cstddef>

#include "fd_grid.hpp"

#define WAVE_8_HALO_SIZE 4
template<typename T>
struct wave_2D_params
{
int maxrow;
int maxcol;
T dt;
T velocity;
T damping;
};
template<typename T>
struct wave_equation_context
{
fd_grid<T> g_prev;
fd_grid<T> g_curr;
fd_grid<T> g_next;
T velocity;
T damping;
T dt;
int maxiter;
wave_equation_context(size_t rows, size_t cols, T vel, T damp, T pdt, int pmaxiter)
{
g_prev = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
g_curr = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
g_next = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
velocity = vel;
damping = damp;
dt = pdt;
maxiter = pmaxiter;
init();
}
void init(void)
{
for (size_t i = 0; i < g_curr.domain_rows(); i++)
{
for (size_t j = 0; j < g_curr.domain_cols(); j++)
{
T y = g_curr.dy()*i - 0.3;
T x = g_curr.dx()*j - 0.1;
g_prev(i,j) = -std::exp(-2400*(x*x + y*y));
g_curr(i,j) = 2*dt*g_prev(i,j);
g_next(i,j) = 0.0;
}
}
}
};
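For reference, the update rule that every kernel below implements, read off from the code (a is the damping, c the velocity, and D_x^(8), D_y^(8) the 8th-order second-difference stencils scaled by 1/Δx² and 1/Δy²; the value is forced to zero on the domain boundary):

$$ u^{n+1}_{i,j} = (2 - a\,\Delta t)\,u^{n}_{i,j} - (1 - a\,\Delta t)\,u^{n-1}_{i,j} + c^2 \Delta t^2 \left(D_x^{(8)} + D_y^{(8)}\right) u^{n}_{i,j} $$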
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <iostream>
#include <fstream>
#include <sstream>
#include <chrono>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <algorithm>
#include <utility>
#include <cassert>
#include <list>
#include <vector>
#include <cmath>

#include "fd_grid.hpp"
#include "fd_dataio.hpp"
#include "fd_wave.hpp"
struct tile
{
int dim_x;
int dim_y;
int idx_x;
int idx_y;
tile() {}
tile(int dx, int dy, int ix, int iy)
: dim_x(dx), dim_y(dy), idx_x(ix), idx_y(iy)
{}
};
inline std::ostream&
operator<<(std::ostream& os, const tile& tl)
{
os << "tile (" << tl.idx_x << ", " << tl.idx_y << ") " << tl.dim_x << ", " << tl.dim_y;
return os;
}
/* This is the naive implementation of a wave equation kernel. You can specify
* a tile to compute only on a subdomain. */
template<typename T>
void
wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params, tile tl)
{
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
T c = params.velocity;
T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
for (size_t i = 0; i < tl.dim_y; i++)
{
for (size_t j = 0; j < tl.dim_x; j++)
{
/* Get X/Y coordinates of this grid point */
int ofs_x = tl.dim_x * tl.idx_x + j;
int ofs_y = tl.dim_y * tl.idx_y + i;
if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
{
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
//deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y,ofs_x+k);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
//deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y+k,ofs_x);
T val = (deriv_x + deriv_y) - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) + (2.0 - a*dt)*g_curr(ofs_y, ofs_x);
if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
val = 0;
g_next(ofs_y, ofs_x) = val;
}
}
}
}
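The w0..w4 values above are the standard 8th-order central-difference weights for the second derivative. A small standalone check (assumed test function and spacing, not part of the commit) applies the same stencil to sin(x) and compares against the exact -sin(x):

#include <cmath>
#include <cstdio>

int main()
{
    const double w[9] = { -1.0/560.0, 8.0/315.0, -1.0/5.0, 8.0/5.0, -205.0/72.0,
                          8.0/5.0, -1.0/5.0, 8.0/315.0, -1.0/560.0 };
    const double h = 0.01;        // grid spacing (arbitrary for the test)
    const double x = 0.7;         // evaluation point (arbitrary)
    double d2 = 0.0;
    for (int k = -4; k <= 4; k++)
        d2 += w[k + 4] * std::sin(x + k*h);
    d2 /= h*h;                    // the weights give h^2 * u''
    // Exact second derivative of sin is -sin; the two values should agree
    // to roughly ten significant digits at this spacing.
    std::printf("stencil: %.12f  exact: %.12f\n", d2, -std::sin(x));
    return 0;
}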
/* This is a "blocked" kernel. It copies a block on a "private" chunk of
* memory, computes there and stores the result back. This approach looks
* slightly faster than the "naive" kernel. */
#define WAVE_8_CPUKER_COLS 56
#define WAVE_8_CPUKER_ROWS 56
template<typename T>
void
wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params, tile tl)
{
const int maxrow = params.maxrow;
const int maxcol = params.maxcol;
const T dt = params.dt;
const T c = params.velocity;
const T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
static const size_t MX_SZ = WAVE_8_CPUKER_COLS+2*WAVE_8_HALO_SIZE;
static const size_t MY_SZ = WAVE_8_CPUKER_ROWS+2*WAVE_8_HALO_SIZE;
T s_curr[MX_SZ][MY_SZ];
for (size_t bi = 0; bi < tl.dim_y; bi += WAVE_8_CPUKER_ROWS)
{
for (size_t bj = 0; bj < tl.dim_x; bj += WAVE_8_CPUKER_COLS)
{
/* Load working tile */
for (size_t i = 0; i < MY_SZ; i++)
{
for (size_t j = 0; j < MX_SZ; j++)
{
int ofs_x = tl.idx_x * tl.dim_x + bj + j;
int ofs_y = tl.idx_y * tl.dim_y + bi + i;
if ( (ofs_x < (maxcol+WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow+WAVE_8_HALO_SIZE)) )
s_curr[i][j] = g_curr(ofs_y-WAVE_8_HALO_SIZE, ofs_x-WAVE_8_HALO_SIZE);
}
}
/* Do finite differences */
for (size_t i = 0; i < WAVE_8_CPUKER_ROWS; i++)
{
for (size_t j = 0; j < WAVE_8_CPUKER_COLS; j++)
{
/* Get X/Y coordinates of this grid point */
int ofs_x = tl.idx_x * tl.dim_x + bj + j;
int ofs_y = tl.idx_y * tl.dim_y + bi + i;
if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
{
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = (deriv_x + deriv_y)
- (1.0 - a*dt) * g_prev(ofs_y, ofs_x)
+ (2.0 - a*dt) * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
val = 0;
g_next(ofs_y, ofs_x) = val;
}
}
}
}
}
}
template<typename T, bool blocked>
double
solve_sequential_aux(wave_equation_context<T>& wec)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
tile tl;
tl.dim_x = params.maxcol;
tl.dim_y = params.maxrow;
tl.idx_x = 0;
tl.idx_y = 0;
double time = 0.0;
std::ofstream ofs("itertime.txt");
for (size_t i = 0; i < wec.maxiter; i++)
{
auto start = std::chrono::high_resolution_clock::now();
if (blocked)
wave_2D_kernel_tiled_blocked(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
else
wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (i%100) == 0 )
{
std::stringstream ss;
ss << "wave_tiled_" << i << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
ofs << i << " " << ms.count() << std::endl;
}
if (blocked)
{
std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl;
}
else
{
std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl;
}
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_sequential_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
template<typename T>
double solve_sequential(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,false>(wec);
}
template<typename T>
double solve_sequential_blocked(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,true>(wec);
}
template<typename T>
double solve_multithread(wave_equation_context<T>& wec, size_t nths)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
/* Tile sizes */
size_t tile_size_x = 32;
size_t tile_size_y = 32;
size_t num_tiles_x = params.maxcol/tile_size_x + 1;
size_t num_tiles_y = params.maxrow/tile_size_y + 1;
/* Multithreading stuff */
std::mutex cout_mtx;
std::mutex cv_mtx;
std::condition_variable prod_cv;
std::condition_variable cons_cv;
std::list<tile> tiles;
bool iteration_finished = false;
std::vector<bool> thread_done(nths);
for (size_t i = 0; i < nths; i++)
thread_done[i] = true;
auto thread_lambda = [&](int tid) {
while (1)
{
/* Wait for the producer to notify that there's something to do */
{
std::unique_lock<std::mutex> lck(cv_mtx);
while ( thread_done[tid] )
cons_cv.wait(lck);
if (iteration_finished)
{
assert(tiles.size() == 0);
return;
}
}
/* Fetch work and process data */
while (1)
{
tile tl;
{
std::unique_lock<std::mutex> lck(cv_mtx);
if (tiles.size() == 0)
break;
assert(tiles.size() != 0);
tl = tiles.front();
tiles.pop_front();
}
wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
}
/* Work for this thread finished, notify producer */
std::unique_lock<std::mutex> lck(cv_mtx);
thread_done[tid] = true;
prod_cv.notify_one();
}
};
std::vector<std::thread> threads(nths);
for (size_t i = 0; i < nths; i++)
threads[i] = std::thread(thread_lambda, i);
double time = 0.0;
for (size_t iter = 0; iter < wec.maxiter; iter++)
{
auto start = std::chrono::high_resolution_clock::now();
std::unique_lock<std::mutex> lck(cv_mtx);
for (size_t i = 0; i < num_tiles_y; i++)
for (size_t j = 0; j < num_tiles_x; j++)
tiles.push_back( tile(tile_size_x, tile_size_y, j, i) );
/* Mark data ready and start the threads */
for (size_t i = 0; i < nths; i++)
thread_done[i] = false;
cons_cv.notify_all();
while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) )
prod_cv.wait(lck);
assert(tiles.size() == 0);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (iter%100) == 0 )
{
std::stringstream ss;
ss << "wave_mt_" << iter << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
}
/* Tell all the threads to stop */
{
std::unique_lock<std::mutex> lck(cv_mtx);
for (size_t i = 0; i < nths; i++)
thread_done[i] = false;
iteration_finished = true;
cons_cv.notify_all();
}
/* Wait for all the threads to finish */
for (auto& th : threads)
th.join();
std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
std::cout << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
std::cout << time << "ms" << std::endl;
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_tiled_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
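solve_multithread above hands out tiles through a shared list guarded by one mutex and two condition variables: the producer fills the list, wakes the consumers and waits until every worker reports back. A stripped-down sketch of the same hand-off idea (a busy counter instead of the per-thread flags, hypothetical work items, no FD kernel):

#include <condition_variable>
#include <cstdio>
#include <list>
#include <mutex>
#include <thread>
#include <vector>

int main()
{
    std::mutex mtx;
    std::condition_variable cons_cv, prod_cv;
    std::list<int> work;               // stands in for the tile list
    bool done = false;
    int busy = 0;

    auto worker = [&] {
        while (true)
        {
            std::unique_lock<std::mutex> lck(mtx);
            cons_cv.wait(lck, [&]{ return !work.empty() || done; });
            if (done)
                return;
            int item = work.front();
            work.pop_front();
            busy++;
            lck.unlock();
            std::printf("processing item %d\n", item);   // kernel call would go here
            lck.lock();
            busy--;
            prod_cv.notify_one();
        }
    };

    std::vector<std::thread> pool;
    for (int i = 0; i < 4; i++)
        pool.emplace_back(worker);

    {   /* one "iteration": enqueue work and wait for the list to drain */
        std::unique_lock<std::mutex> lck(mtx);
        for (int i = 0; i < 16; i++)
            work.push_back(i);
        cons_cv.notify_all();
        prod_cv.wait(lck, [&]{ return work.empty() && busy == 0; });
    }

    {   /* shut the pool down */
        std::unique_lock<std::mutex> lck(mtx);
        done = true;
        cons_cv.notify_all();
    }

    for (auto& th : pool)
        th.join();
    return 0;
}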
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#include <iostream>
#include <sstream>
#include <cstdio>
#include <cassert>
#include <type_traits>

#include "fd_grid.hpp"
#include "fd_dataio.hpp"
#include "fd_wave.hpp"

#define DEBUG
#include "cuda_runtime.h"
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
/* 8th order 2nd derivative weights */
__constant__ float d_w32[9];
__constant__ double d_w64[9];
#define WAVE_8_KER_ROWS 32
#define WAVE_8_KER_COLS 32
template<typename T>
__global__ void
wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params)
{
// HHHH
// H****H H = halo, * = data
// H****H We load a tile as in figure. This wastes bandwidth
// H****H because halos are loaded two times each, but it is
// H****H the easiest way to remain generic w.r.t. grid size
// HHHH
__shared__ T s_curr[WAVE_8_KER_ROWS+2*WAVE_8_HALO_SIZE][WAVE_8_KER_COLS+2*WAVE_8_HALO_SIZE];
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
T c = params.velocity;
T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x + WAVE_8_HALO_SIZE;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y + WAVE_8_HALO_SIZE;
/* Get offset of this grid point in the fd grid array */
int ofs = (maxcol + 2*WAVE_8_HALO_SIZE) * ofs_y + ofs_x;
/* Local coordinates inside the shared-memory tile */
int i = threadIdx.y + WAVE_8_HALO_SIZE;
int j = threadIdx.x + WAVE_8_HALO_SIZE;
bool in_grid = (ofs_x < (maxcol + WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow + WAVE_8_HALO_SIZE));
if (in_grid)
{
s_curr[i][j] = u_curr[ofs];
if (threadIdx.x < WAVE_8_HALO_SIZE)
{
/* X-halo */
int ox = min(WAVE_8_KER_COLS, maxcol-blockDim.x * blockIdx.x);
s_curr[i][j-WAVE_8_HALO_SIZE] = u_curr[ofs-WAVE_8_HALO_SIZE];
s_curr[i][j+ox] = u_curr[ofs+ox];
}
if (threadIdx.y < WAVE_8_HALO_SIZE)
{
/* Y-halo */
int oy = min(WAVE_8_KER_ROWS, maxrow-blockDim.y * blockIdx.y);
int b_ofs = ofs - WAVE_8_HALO_SIZE * (2*WAVE_8_HALO_SIZE+maxcol);
int t_ofs = ofs + oy * (2*WAVE_8_HALO_SIZE+maxcol);
s_curr[i-WAVE_8_HALO_SIZE][j] = u_curr[b_ofs];
s_curr[i+oy][j] = u_curr[t_ofs];
}
}
/* Every thread of the block must reach the barrier, so it stays outside
 * the bounds check. */
__syncthreads();
if (in_grid)
{
T* w;
if ( std::is_same<T,float>::value)
w = (T*)d_w32;
else
w = (T*)d_w64;
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
//deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i][j+k];
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
//deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+k][j];
T val = (deriv_x + deriv_y) - (1.0 - a*dt) * u_prev[ofs] + (2.0 - a*dt)*s_curr[i][j];
/* Homogeneous Dirichlet boundary conditions */
if ( (ofs_x == WAVE_8_HALO_SIZE) or (ofs_y == WAVE_8_HALO_SIZE) or
(ofs_x == maxcol+WAVE_8_HALO_SIZE-1) or (ofs_y == maxrow+WAVE_8_HALO_SIZE-1)
)
val = 0;
u_next[ofs] = val;
}
}
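The halo-loading scheme in the comment above is easier to see in one dimension. A hypothetical 1D kernel (not part of fd_catalog) that stages a tile plus HALO cells on each side into shared memory, assuming blockDim.x == TILE and arrays of n + 2*HALO entries:

#include <cuda_runtime.h>

#define TILE 128
#define HALO 4

/* Launch e.g. as stage_with_halo<<<(n + TILE - 1)/TILE, TILE>>>(u, out, n). */
template<typename T>
__global__ void stage_with_halo(const T* u, T* out, int n)
{
    __shared__ T s[TILE + 2*HALO];

    int gx = blockIdx.x * blockDim.x + threadIdx.x + HALO; /* global index, past the left halo */
    int lx = threadIdx.x + HALO;                           /* local index inside the shared tile */
    bool in_grid = gx < n + HALO;

    if (in_grid)
    {
        s[lx] = u[gx];
        if (threadIdx.x < HALO)   /* the first HALO threads also load both halos */
        {
            int ox = min(TILE, n - (int)(blockIdx.x * blockDim.x));
            s[lx - HALO] = u[gx - HALO];
            s[lx + ox]   = u[gx + ox];
        }
    }
    __syncthreads();              /* barrier outside the bounds check */

    if (in_grid)
        out[gx] = s[lx];          /* a real kernel would apply the stencil here */
}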
template<typename T>
double solve_cuda_aux(wave_equation_context<T>& wec)
{
/* Get device properties */
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, 0) );
printf("Device: %s (Compute Capability: %d.%d)\n",
prop.name, prop.major, prop.minor);
/**** Initialize constants ****/
T w0 = -205.0/72.0;
T w1 = 8.0/5.0;
T w2 = -1.0/5.0;
T w3 = 8.0/315.0;
T w4 = -1.0/560.0;
T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
if (std::is_same<T,float>::value)
checkCuda( cudaMemcpyToSymbol(d_w32, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
if (std::is_same<T,double>::value)
checkCuda( cudaMemcpyToSymbol(d_w64, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
/* Allocate memory on device */
T* u_prev;
T* u_curr;
T* u_next;
T* u_temp;
checkCuda( cudaMalloc((void**)&u_prev, wec.g_prev.size()*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_curr, wec.g_curr.size()*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_next, wec.g_next.size()*sizeof(T)) );
/* Copy grids on device */
checkCuda( cudaMemcpy(u_prev, wec.g_prev.data(), wec.g_prev.size()*sizeof(T), cudaMemcpyHostToDevice) );
checkCuda( cudaMemcpy(u_curr, wec.g_curr.data(), wec.g_curr.size()*sizeof(T), cudaMemcpyHostToDevice) );
checkCuda( cudaMemcpy(u_next, wec.g_next.data(), wec.g_next.size()*sizeof(T), cudaMemcpyHostToDevice) );
/* Launch parameters */
dim3 grid_size(wec.g_curr.domain_cols()/WAVE_8_KER_COLS + 1, wec.g_curr.domain_rows()/WAVE_8_KER_ROWS + 1);
dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS);
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
/* Prepare for timing */
cudaEvent_t startEvent, stopEvent;
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventRecord(startEvent, 0) );
for (size_t i = 0; i < wec.maxiter; i++)
{
wave_2D_kernel_cuda<<<grid_size, threads_per_block>>>(u_prev, u_curr, u_next, params);
u_temp = u_prev;
u_prev = u_curr;
u_curr = u_next;
u_next = u_temp;
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (i%100) == 0 )
{
checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) );
std::stringstream ss;
ss << "wave_cuda_" << i << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
}
float milliseconds;
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
std::cout << "[Wave][Cuda] Iteration Time: " << milliseconds/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][Cuda] Wall Time: " << milliseconds << "ms" << std::endl;
#ifdef HAVE_SILO
checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) );
visit_dump(wec.g_curr, "wave_cuda_lastiter.silo");
#endif /* HAVE_SILO */
checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
checkCuda( cudaFree(u_prev) );
checkCuda( cudaFree(u_curr) );
checkCuda( cudaFree(u_next) );
return milliseconds/wec.maxiter;
}
double solve_cuda(wave_equation_context<float>& wec)
{
return solve_cuda_aux(wec);
}
double solve_cuda(wave_equation_context<double>& wec)
{
return solve_cuda_aux(wec);
}
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include "fd_wave.hpp"

double solve_cuda(wave_equation_context<float>&);
double solve_cuda(wave_equation_context<double>&);
\ No newline at end of file
...
@@ -85,7 +85,7 @@ visit_dump(const fd_grid<double>& g, const std::string& fn)
return 0;
}
-//#define HIGH_ORDER
+#define HIGH_ORDER
#ifdef HIGH_ORDER
#define HALO_SIZE 4
...
@@ -97,16 +97,16 @@ int main( int argc, char* argv[] )
{
using namespace std::chrono;
-int Nx = 1024;
-int Ny = 1024;
+int Nx = 500;
+int Ny = 500;
int halo = HALO_SIZE;
int rows = Ny + 2*halo;
int cols = Nx + 2*halo;
-int timesteps = 500;
+int timesteps = 5000;
double dx = 1./(Nx-1);
double dy = 1./(Ny-1);
-double dt = 0.0001;
+double dt = 0.001;
double velocity = 1;
double damping = 0.1;
......
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdio>
#include <cassert>
#include <silo.h>
#define DEBUG
#define SINGLE_PRECISION
template<typename T>
class fd_grid
{
std::vector<T> m_data;
size_t m_rows, m_cols;
public:
fd_grid()
{}
fd_grid(size_t rows, size_t cols) :
m_data(rows*cols), m_rows(rows), m_cols(cols)
{}
fd_grid(fd_grid&& other)
{
std::swap(m_data, other.m_data);
}
fd_grid& operator=(fd_grid&& other)
{
std::swap(m_data, other.m_data);
return *this;
}
T operator()(size_t i, size_t j) const
{
return m_data[i*m_cols + j];
}
T& operator()(size_t i, size_t j)
{
return m_data[i*m_cols + j];
}
size_t rows() const { return m_rows; }
size_t cols() const { return m_cols; }
T* data() { return m_data.data(); }
const T* data() const { return m_data.data(); }
};
template<typename T>
int
visit_dump(const fd_grid<T>& g, const std::string& fn)
{
static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = g.cols();
auto size_y = g.rows();
std::vector<T> x(size_x);
std::vector<T> y(size_y);
for (size_t i = 0; i < size_x; i++)
x.at(i) = T(i)/(size_x-1);
for (size_t i = 0; i < size_y; i++)
y.at(i) = T(i)/(size_y-1);
int dims[] = { int(size_y), int(size_y) };
int ndims = 2;
T *coords[] = {x.data(), y.data()};
if (std::is_same<T,float>::value)
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
}
else
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
}
DBClose(db);
return 0;
}
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
#ifdef SINGLE_PRECISION
__constant__ float d_w[9];
#else
__constant__ double d_w[9];
#endif
-template<typename T>
-__global__ void test_kernel(T* data)
+#include <sstream>
+#include <cstdio>
{
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
data[ofs] = gridDim.x * blockIdx.y + blockIdx.x;
}
#define GRID_ROWS 32
#define GRID_COLS 32
#define HALO_SIZE 4
int grid_rows;
int grid_cols;
template<typename T>
__global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[GRID_COLS+2*HALO_SIZE];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs = threadIdx.x + HALO_SIZE;
s_curr[shm_ofs] = u_curr[ofs];
__syncthreads();
if (threadIdx.x < 4)
{
s_curr[threadIdx.x] = 0.0;
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#if 0
template<typename T>
__global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[2*HALO_SIZE][32];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs_x = threadIdx.x;
int shm_ofs_y = threadIdx.y;
s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs];
__syncthreads();
-if (threadIdx.y < 4)
-{
-s_curr[threadIdx.x] = 0.0;
+#include "fd_grid.hpp"
+#include "fd_dataio.hpp"
+#include "fd_wave_cpu.hpp"
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#endif
int main(void)
{
#ifdef SINGLE_PRECISION
using T = float;
#else
using T = double;
#endif
-/**** Get device properties ****/
+wave_equation_context<T> wec(512, 512, 1, 0.1, 0.0001, 5000);
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, 0) );
printf("Device Name: %s\n", prop.name);
printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);
/**** Initialize constants ****/
T w0 = -205.0/72.0;
T w1 = 8.0/5.0;
T w2 = -1.0/5.0;
T w3 = 8.0/315.0;
T w4 = -1.0/560.0;
T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
-/**** Prepare for timing ****/
-float milliseconds;
-cudaEvent_t startEvent, stopEvent;
-checkCuda( cudaEventCreate(&startEvent) );
-checkCuda( cudaEventCreate(&stopEvent) );
+#if 0
+#ifdef __NVCC__
+wec.init();
+solve_cuda(wec);
+#endif /* __NVCC__ */
#endif
/**** Prepare grid and copy to device ****/
fd_grid<T> g(GRID_ROWS, GRID_COLS);
for (size_t i = 0; i < GRID_ROWS; i++)
for (size_t j = 0; j < GRID_COLS; j++)
g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS);
//g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS);
T* u_curr;
T* u_next;
checkCuda( cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) );
dim3 gs(1,GRID_ROWS);
dim3 tpb(GRID_COLS,1);
checkCuda( cudaEventRecord(startEvent, 0) );
//for (size_t i = 0; i < 1; i++)
x_kernel<<<gs, tpb>>>(u_curr, u_next);
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl;
checkCuda( cudaDeviceSynchronize() );
-checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) );
-checkCuda( cudaFree(u_next) );
-checkCuda( cudaFree(u_curr) );
-visit_dump(g, "cuda.silo");
-return 0;
+wec.init();
+solve_sequential(wec);
+wec.init();
+solve_sequential_blocked(wec);
+auto maxthreads = std::thread::hardware_concurrency();
+for (size_t i = 1; i <= maxthreads; i *= 2)
{
wec.init();
solve_multithread(wec, i);
}
}