Select Git revision
Forked from
gmsh / gmsh
Source project has a limited visibility.
fd_wave_cpu.hpp 15.73 KiB
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#include <iostream>
#include <thread>
#include <condition_variable>
#include <cassert>
#include <list>
#include <cmath>
#include <algorithm>
#include <sstream>
#include <string.h>
#include <pmmintrin.h>
#include <xmmintrin.h>
#include "fd_grid.hpp"
#include "fd_dataio.hpp"
#include "fd_wave.hpp"
struct tile
{
int dim_x;
int dim_y;
int idx_x;
int idx_y;
tile() {}
tile(int dx, int dy, int ix, int iy)
: dim_x(dx), dim_y(dy), idx_x(ix), idx_y(iy)
{}
};
std::ostream&
operator<<(std::ostream& os, const tile& tl)
{
os << "tile (" << tl.idx_x << ", " << tl.idx_y << ") " << tl.dim_x << ", " << tl.dim_y;
return os;
}
#define BLK_ROWS 56
#define BLK_COLS 56
template<typename T>
void
wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params)
{
/* 1) memcpy is faster than for
* 2) if you put inner block processing, you kill perf
*/
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
T c = params.velocity;
T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE];
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt);
size_t blocks_x = maxcol / BLK_COLS;
size_t blocks_y = maxrow / BLK_ROWS;
size_t rem_x = maxcol % BLK_COLS;
size_t rem_y = maxrow % BLK_ROWS;
/* Do fixed-size blocks */
for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
{
for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
{
/* Here all the dimensions are known at compile time, let the
* compiler do its job. */
for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
{
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
}
for (size_t i = 0; i < BLK_ROWS; i++)
{
for (size_t j = 0; j < BLK_COLS; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
}
}
}
}
/* Do upper blocks */
{
size_t bi = blocks_y*BLK_ROWS;
for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
{
for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
{
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
}
for (size_t i = 0; i < rem_y; i++)
{
for (size_t j = 0; j < BLK_COLS; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
}
}
}
}
/* Do upper leftmost corner */
{
size_t bi = blocks_y*BLK_ROWS;
size_t bj = blocks_x*BLK_COLS;
for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
{
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
}
for (size_t i = 0; i < rem_y; i++)
{
for (size_t j = 0; j < rem_x; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
}
}
}
/* Do leftmost blocks */
for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
{
size_t bj = blocks_x*BLK_COLS;
for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
{
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T));
}
for (size_t i = 0; i < BLK_ROWS; i++)
{
for (size_t j = 0; j < rem_x; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
}
}
}
/* Apply BCs */
for (size_t j = 0; j < maxcol; j++)
{
g_next(0,j) = 0;
g_next(maxrow-1, j) = 0;
}
for (size_t i = 0; i < maxrow; i++)
{
g_next(i, 0) = 0;
g_next(i, maxcol-1) = 0;
}
}
template<typename T>
void
wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params,
size_t thread_id = 0, size_t num_threads = 1)
{
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
T c = params.velocity;
T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt);
for (size_t i = thread_id; i < maxrow; i += num_threads)
{
for (size_t j = 0; j < maxcol; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i,j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j);
T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j);
if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
val = 0;
g_next(i, j) = val;
}
}
}
template<typename T, bool blocked>
double
solve_sequential_aux(wave_equation_context<T>& wec)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
tile tl;
tl.dim_x = params.maxcol;
tl.dim_y = params.maxrow;
tl.idx_x = 0;
tl.idx_y = 0;
double time = 0.0;
#ifdef SAVE_ITERTIME
std::ofstream ofs;
if (blocked)
ofs.open("itertime-blocked.txt");
else
ofs.open("itertime-naive.txt");
#endif /* SAVE_ITERTIME */
for (size_t i = 0; i < wec.maxiter; i++)
{
auto start = std::chrono::high_resolution_clock::now();
if (blocked)
wave_2D_kernel_blk(wec.g_prev, wec.g_curr, wec.g_next, params);
else
wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (i%100) == 0 )
{
std::stringstream ss;
if (blocked)
ss << "wave_seqblk_" << i << ".silo";
else
ss << "wave_seq_" << i << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
#ifdef SAVE_ITERTIME
ofs << i << " " << ms.count() << std::endl;
#endif /* SAVE_ITERTIME */
}
if (blocked)
{
std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl;
double itertime = time/wec.maxiter;
double gflops_s = 58*(params.maxrow*params.maxcol)/(1e6*itertime);
std::cout << "[Wave][SeqBlk] GFlops/s: " << gflops_s << std::endl;
}
else
{
std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl;
double itertime = time/wec.maxiter;
double gflops_s = 58*(params.maxrow*params.maxcol)/(1e6*itertime);
std::cout << "[Wave][Sequential] GFlops/s: " << gflops_s << std::endl;
size_t kernel_bytes = 3*sizeof(T)*(params.maxrow*params.maxcol);
double gbytes_s = kernel_bytes/(1e6*itertime);
std::cout << "[Wave][Sequential] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
}
#ifdef HAVE_SILO
if (blocked)
visit_dump(wec.g_curr, "wave_seqblk_lastiter.silo");
else
visit_dump(wec.g_curr, "wave_seq_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
template<typename T>
double solve_sequential(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,false>(wec);
}
template<typename T>
double solve_sequential_blocked(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,true>(wec);
}
template<typename T>
double solve_multithread(wave_equation_context<T>& wec, size_t nths)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
/* Multithreading stuff */
std::mutex cv_mtx;
std::condition_variable prod_cv;
std::condition_variable cons_cv;
std::vector<bool> thread_done(nths);
bool iteration_finished = false;
auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
#ifdef DISALLOW_DENORMALS
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
while (1)
{
/* Wait for the producer to notify that there's something to do */
{
std::unique_lock<std::mutex> lck(cv_mtx);
while ( thread_done[thread_id] )
cons_cv.wait(lck);
if (iteration_finished)
return;
}
/* Do the timestep */
wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads);
/* Work for this thread finished, notify producer */
std::unique_lock<std::mutex> lck(cv_mtx);
prod_cv.notify_one();
thread_done[thread_id] = true;
}
};
std::vector<std::thread> threads(nths);
for (size_t i = 0; i < nths; i++)
threads[i] = std::thread(thread_lambda, i, nths);
double time = 0.0;
for (size_t iter = 0; iter < wec.maxiter; iter++)
{
auto start = std::chrono::high_resolution_clock::now();
std::unique_lock<std::mutex> lck(cv_mtx);
/* Mark data ready and start the threads */
for (size_t i = 0; i < nths; i++)
thread_done[i] = false;
cons_cv.notify_all();
while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) )
prod_cv.wait(lck);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (iter%100) == 0 )
{
std::stringstream ss;
ss << "wave_mt_" << iter << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
}
/* Tell all the threads to stop */
{
std::unique_lock<std::mutex> lck(cv_mtx);
for (size_t i = 0; i < nths; i++)
thread_done[i] = false;
iteration_finished = true;
cons_cv.notify_all();
}
/* Wait for all the threads to finish */
for (auto& th : threads)
th.join();
std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
std::cout << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
std::cout << time << "ms" << std::endl;
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}