/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <iostream>
#include <fstream>
#include <sstream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <algorithm>
#include <vector>
#include <list>
#include <cassert>
#include <cmath>

#include "fd_grid.hpp"
#include "fd_dataio.hpp"
#include "fd_wave.hpp"
struct tile
{
int dim_x;
int dim_y;
int idx_x;
int idx_y;
tile() {}
tile(int dx, int dy, int ix, int iy)
: dim_x(dx), dim_y(dy), idx_x(ix), idx_y(iy)
{}
};
std::ostream&
operator<<(std::ostream& os, const tile& tl)
{
os << "tile (" << tl.idx_x << ", " << tl.idx_y << ") " << tl.dim_x << ", " << tl.dim_y;
return os;
}
/* This is the naive implementation of a wave equation kernel. You can specify
* a tile to compute only on a subdomain. */
template<typename T>
void
wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params, tile tl)
{
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
T c = params.velocity;
T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
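    /* w[] holds the standard 8th-order central-difference weights for the
     * second derivative; the loops below apply them symmetrically around the
     * centre point in each direction. */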
for (size_t i = 0; i < tl.dim_y; i++)
{
for (size_t j = 0; j < tl.dim_x; j++)
{
/* Get X/Y coordinates of this grid point */
int ofs_x = tl.dim_x * tl.idx_x + j;
int ofs_y = tl.dim_y * tl.idx_y + i;
if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
{
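                /* Leapfrog update of the damped wave equation:
                 *   u^{n+1} = c^2 dt^2 (d2u/dx2 + d2u/dy2)
                 *             + (2 - a*dt) u^n - (1 - a*dt) u^{n-1}
                 * The (maxcol-1) and (maxrow-1) factors play the role of 1/h
                 * in each direction, assuming a unit-square domain. */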
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
//deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y,ofs_x+k);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
//deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y+k,ofs_x);
T val = (deriv_x + deriv_y) - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) + (2.0 - a*dt)*g_curr(ofs_y, ofs_x);
if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
val = 0;
g_next(ofs_y, ofs_x) = val;
}
}
}
}
/* This is a "blocked" kernel. It copies a block on a "private" chunk of
* memory, computes there and stores the result back. This approach looks
* slightly faster than the "naive" kernel. */
#define WAVE_8_CPUKER_COLS 56
#define WAVE_8_CPUKER_ROWS 56
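/* With WAVE_8_HALO_SIZE presumably equal to 4 (8th-order stencil), a 56x56
 * block padded with halos gives a 64x64 working tile, i.e. about 32 KiB for
 * double and 16 KiB for float, so the per-block working set stays small. */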
template<typename T>
void
wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params, tile tl)
{
const int maxrow = params.maxrow;
const int maxcol = params.maxcol;
const T dt = params.dt;
const T c = params.velocity;
const T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
static const size_t MX_SZ = WAVE_8_CPUKER_COLS+2*WAVE_8_HALO_SIZE;
static const size_t MY_SZ = WAVE_8_CPUKER_ROWS+2*WAVE_8_HALO_SIZE;
T s_curr[MY_SZ][MX_SZ];    /* indexed as [row][col]: first index spans MY_SZ rows, second MX_SZ columns */
for (size_t bi = 0; bi < tl.dim_y; bi += WAVE_8_CPUKER_ROWS)
{
for (size_t bj = 0; bj < tl.dim_x; bj += WAVE_8_CPUKER_COLS)
{
/* Load working tile */
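            /* The working tile carries a halo of WAVE_8_HALO_SIZE cells on
             * each side, hence the -WAVE_8_HALO_SIZE shift when reading from
             * g_curr. Some indices fall in the grid's ghost/halo region,
             * which fd_grid is assumed to handle. */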
for (size_t i = 0; i < MY_SZ; i++)
{
for (size_t j = 0; j < MX_SZ; j++)
{
int ofs_x = tl.idx_x * tl.dim_x + bj + j;
int ofs_y = tl.idx_y * tl.dim_y + bi + i;
if ( (ofs_x < (maxcol+WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow+WAVE_8_HALO_SIZE)) )
s_curr[i][j] = g_curr(ofs_y-WAVE_8_HALO_SIZE, ofs_x-WAVE_8_HALO_SIZE);
}
}
/* Do finite differences */
for (size_t i = 0; i < WAVE_8_CPUKER_ROWS; i++)
{
for (size_t j = 0; j < WAVE_8_CPUKER_COLS; j++)
{
/* Get X/Y coordinates of this grid point */
int ofs_x = tl.idx_x * tl.dim_x + bj + j;
int ofs_y = tl.idx_y * tl.dim_y + bi + i;
if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
{
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = (deriv_x + deriv_y)
- (1.0 - a*dt) * g_prev(ofs_y, ofs_x)
+ (2.0 - a*dt) * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
val = 0;
g_next(ofs_y, ofs_x) = val;
}
}
}
}
}
}
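/* Sequential driver: runs the whole grid as a single tile. The `blocked`
 * template parameter selects at compile time between the naive and the
 * blocked kernel. Per-iteration timings are also written to itertime.txt. */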
template<typename T, bool blocked>
double
solve_sequential_aux(wave_equation_context<T>& wec)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
tile tl;
tl.dim_x = params.maxcol;
tl.dim_y = params.maxrow;
tl.idx_x = 0;
tl.idx_y = 0;
double time = 0.0;
std::ofstream ofs("itertime.txt");
for (size_t i = 0; i < wec.maxiter; i++)
{
auto start = std::chrono::high_resolution_clock::now();
if (blocked)
wave_2D_kernel_tiled_blocked(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
else
wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (i%100) == 0 )
{
std::stringstream ss;
ss << "wave_tiled_" << i << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
ofs << i << " " << ms.count() << std::endl;
}
if (blocked)
{
std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl;
}
else
{
std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl;
}
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_sequential_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
template<typename T>
double solve_sequential(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,false>(wec);
}
template<typename T>
double solve_sequential_blocked(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,true>(wec);
}
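/* Multithreaded driver: the main thread acts as producer and fills a queue of
 * tiles at each timestep; `nths` worker threads pop tiles and run the naive
 * kernel on them. Synchronization uses two condition variables: cons_cv wakes
 * the workers when work is available, prod_cv wakes the producer when all
 * workers have finished their share of the iteration. */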
template<typename T>
double solve_multithread(wave_equation_context<T>& wec, size_t nths)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
/* Tile sizes */
size_t tile_size_x = 32;
size_t tile_size_y = 32;
size_t num_tiles_x = params.maxcol/tile_size_x + 1;
size_t num_tiles_y = params.maxrow/tile_size_y + 1;
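    /* The integer division above may overshoot by one tile row/column when
     * the domain size is an exact multiple of the tile size; the kernel's
     * bounds check simply skips points outside the domain. */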
/* Multithreading stuff */
std::mutex cout_mtx;
std::mutex cv_mtx;
std::condition_variable prod_cv;
std::condition_variable cons_cv;
std::list<tile> tiles;
size_t running_threads = 0;
bool iteration_finished = false;
std::vector<bool> thread_done(nths);
for (size_t i = 0; i < nths; i++)
thread_done[i] = true;
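    /* thread_done[i] == true means worker i is idle; workers start idle and
     * wait until the producer clears their flag and notifies cons_cv. */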
auto thread_lambda = [&](int tid) {
while (1)
{
/* Wait for the producer to notify that there's something to do */
{
std::unique_lock<std::mutex> lck(cv_mtx);
while ( thread_done[tid] )
cons_cv.wait(lck);
if (iteration_finished)
{
assert(tiles.size() == 0);
return;
}
}
/* Fetch work and process data */
while (1)
{
tile tl;
{
std::unique_lock<std::mutex> lck(cv_mtx);
if (tiles.size() == 0)
break;
assert(tiles.size() != 0);
tl = tiles.front();
tiles.pop_front();
}
wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
}
/* Work for this thread finished, notify producer */
std::unique_lock<std::mutex> lck(cv_mtx);
            assert(running_threads > 0);
            running_threads--;
prod_cv.notify_one();
thread_done[tid] = true;
}
};
std::vector<std::thread> threads(nths);
for (size_t i = 0; i < nths; i++)
threads[i] = std::thread(thread_lambda, i);
double time = 0.0;
for (size_t iter = 0; iter < wec.maxiter; iter++)
{
auto start = std::chrono::high_resolution_clock::now();
std::unique_lock<std::mutex> lck(cv_mtx);
for (size_t i = 0; i < num_tiles_y; i++)
for (size_t j = 0; j < num_tiles_x; j++)
tiles.push_back( tile(tile_size_x, tile_size_y, j, i) );
/* Mark data ready and start the threads */
        for (size_t i = 0; i < nths; i++)
            thread_done[i] = false;
        running_threads = nths;
        cons_cv.notify_all();
while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) )
prod_cv.wait(lck);
assert(tiles.size() == 0);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (iter%100) == 0 )
{
std::stringstream ss;
ss << "wave_mt_" << iter << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
}
/* Tell all the threads to stop */
{
std::unique_lock<std::mutex> lck(cv_mtx);
for (size_t i = 0; i < nths; i++)
thread_done[i] = false;
iteration_finished = true;
cons_cv.notify_all();
}
/* Wait for all the threads to finish */
for (auto& th : threads)
th.join();
std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
std::cout << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
std::cout << time << "ms" << std::endl;
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_tiled_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
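/* Typical usage (sketch only; assumes the wave_equation_context<T>
 * constructor defined in fd_wave.hpp, whose signature is not shown here):
 *
 *   wave_equation_context<double> wec(...);   // grid size, dt, velocity, damping, maxiter
 *   solve_sequential(wec);
 *   solve_sequential_blocked(wec);
 *   solve_multithread(wec, std::thread::hardware_concurrency());
 */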