diff --git a/kokkos-testing/CMakeLists.txt b/kokkos-testing/CMakeLists.txt index 72f2f2bc7a8b4ad2bf45e4a9cb578a40e5163d4f..8f95e8543d926ea45df45f638d5a886118b2c2e6 100644 --- a/kokkos-testing/CMakeLists.txt +++ b/kokkos-testing/CMakeLists.txt @@ -1,6 +1,8 @@ project(kokkos-testing) cmake_minimum_required(VERSION 3.10) +find_package(CUDA) + add_subdirectory(kokkos) add_executable(finite_difference_kokkos finite_difference_kokkos.cpp) diff --git a/kokkos-testing/experimental_data/timings.plot b/kokkos-testing/experimental_data/timings.plot index 692912b43448a809c58ad28285a5e0a5742b702b..e59f1c5cf502e975f6463b4e293836e29c5236d7 100644 --- a/kokkos-testing/experimental_data/timings.plot +++ b/kokkos-testing/experimental_data/timings.plot @@ -1,10 +1,10 @@ set term postscript enhanced color set output 'timing.eps' -set title '2D damped wave equation, 1024x1024 grid' +set title '2D damped wave equation' set style data histogram set style fill solid border -1 set logscale y set grid ytics mytics -plot 'timings2.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col +plot 'timings-float.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bebd399d1929ac667d87d14f2cb5ba40f765530 --- /dev/null +++ b/kokkos-testing/fd_catalog/CMakeLists.txt @@ -0,0 +1,5 @@ +project(fd_catalog) +cmake_minimum_required(VERSION 3.10) + +find_package(CUDA) + diff --git a/kokkos-testing/fd_catalog/fd_dataio.hpp b/kokkos-testing/fd_catalog/fd_dataio.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2fe9efc31ef86761252a78521b94754021a7b22c --- /dev/null +++ b/kokkos-testing/fd_catalog/fd_dataio.hpp @@ -0,0 +1,66 @@ +/* Performance evaluation of different implementations of finite difference + * schemes. 
+ * + * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group + */ + +#pragma once + +#include "fd_grid.hpp" + +#ifdef HAVE_SILO +#include <silo.h> +#endif /* HAVE_SILO */ + +#ifdef HAVE_SILO +template<typename T> +int +visit_dump(const fd_grid<T>& g, const std::string& fn) +{ + static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float"); + + DBfile *db = nullptr; + db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_HDF5); + if (!db) + { + std::cout << "Cannot write simulation output" << std::endl; + return -1; + } + + auto size_x = g.grid_cols(); + auto size_y = g.grid_rows(); + + std::vector<T> x(size_x); + std::vector<T> y(size_y); + + auto dx = g.dx(); + auto px = -dx * g.halo(); + for (size_t i = 0; i < size_x; i++) + x.at(i) = px + i*dx; + + auto dy = g.dy(); + auto py = -dy * g.halo(); + for (size_t i = 0; i < size_y; i++) + y.at(i) = py + i*dy; + + int dims[] = { int(size_x), int(size_y) }; + int ndims = 2; + + T *coords[] = {x.data(), y.data()}; + + if (std::is_same<T,float>::value) + { + DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL); + DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL); + } + else + { + DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL); + DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL); + } + + DBClose(db); + return 0; +} +#endif /* HAVE_SILO */ + diff --git a/kokkos-testing/fd_catalog/fd_grid.hpp b/kokkos-testing/fd_catalog/fd_grid.hpp new file mode 100644 index 0000000000000000000000000000000000000000..419954198dd3311ac7a0d6883ad3232bb813a29f --- /dev/null +++ b/kokkos-testing/fd_catalog/fd_grid.hpp @@ -0,0 +1,113 @@ +/* Performance evaluation of different implementations of finite difference + * schemes. 
+ * + * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group + */ + +#pragma once + +#include <vector> +#include <cstdint> + +template<typename T, typename UInt = uint64_t, typename Int = int64_t> +class fd_grid +{ + std::vector<T> m_data; + UInt m_grid_rows, m_grid_cols; + UInt m_domain_rows, m_domain_cols, m_halo; + +public: + fd_grid() + {} + + fd_grid(UInt rows, UInt cols, UInt halo) : + m_grid_rows(rows+2*halo), + m_grid_cols(cols+2*halo), + m_domain_rows(rows), + m_domain_cols(cols), + m_halo(halo), + m_data( (rows+2*halo) * (cols+2*halo) ) + {} + + fd_grid(fd_grid&& other) + { + m_grid_rows = other.m_grid_rows; + m_grid_cols = other.m_grid_cols; + m_domain_rows = other.m_domain_rows; + m_domain_cols = other.m_domain_cols; + m_halo = other.m_halo; + m_data = std::move(other.m_data); + } + + fd_grid& operator=(const fd_grid& other) + { + m_grid_rows = other.m_grid_rows; + m_grid_cols = other.m_grid_cols; + m_domain_rows = other.m_domain_rows; + m_domain_cols = other.m_domain_cols; + m_halo = other.m_halo; + m_data.resize( other.m_data.size() ); + std::copy(other.m_data.begin(), other.m_data.end(), m_data.begin()); + return *this; + } + + fd_grid& operator=(fd_grid&& other) + { + m_grid_rows = other.m_grid_rows; + m_grid_cols = other.m_grid_cols; + m_domain_rows = other.m_domain_rows; + m_domain_cols = other.m_domain_cols; + m_halo = other.m_halo; + m_data = std::move(other.m_data); + return *this; + } + + fd_grid& operator=(const T& val) + { + for (auto& d : m_data) + d = val; + return *this; + } + + /* Negative indices are allowed to access halo */ + T& operator()(Int i, Int j) + { + auto ofs_row = i + m_halo; + assert(ofs_row < m_domain_rows + 2*m_halo); + + auto ofs_col = j + m_halo; + assert(ofs_col < m_domain_cols + 2*m_halo); + + auto grid_cols = m_domain_cols+2*m_halo; + auto ofs = ofs_row * grid_cols + ofs_col; + assert( (ofs >= 0) and (ofs < m_data.size()) ); + return m_data[ofs]; + } + + /* Negative indices are allowed to access halo */ + T operator()(Int i, Int j) const + { + auto ofs_row = i + m_halo; + auto ofs_col = j + m_halo; + auto grid_cols = m_domain_cols+2*m_halo; + auto ofs = ofs_row * grid_cols + ofs_col; + assert( (ofs >= 0) and (ofs < m_data.size()) ); + return m_data[ofs]; + } + + size_t domain_rows() const { return m_domain_rows; } + size_t domain_cols() const { return m_domain_cols; } + size_t halo() const { return m_halo; } + + size_t grid_rows() const { return m_grid_rows; } + size_t grid_cols() const { return m_grid_cols; } + + T* data() { return m_data.data(); } + const T* data() const { return m_data.data(); } + + size_t size() const { return m_data.size(); } + + T dx() const { return 1./(m_domain_cols-1); } + T dy() const { return 1./(m_domain_rows-1); } +}; + diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a98258a4203532aa54267fd7a17f454c6539761e --- /dev/null +++ b/kokkos-testing/fd_catalog/fd_main.cpp @@ -0,0 +1,69 @@ +#include <iostream> +#include <fstream> +#include <cstdio> + +#include "fd_wave_cpu.hpp" + +#ifdef HAVE_CUDA +#include "fd_wave_cuda.hpp" +#endif + +int main(void) +{ +#ifdef SINGLE_PRECISION + using T = float; +#else + using T = double; +#endif + + std::ofstream ofs("timings.txt"); + + /* Make header */ + ofs << "\"SIZE\" \"Seq\" \"SeqBlk\" "; + +#ifdef HAVE_CUDA + ofs << "Cuda "; +#endif + + auto maxthreads = std::thread::hardware_concurrency(); + for (size_t threads = 1; threads <= maxthreads; threads *= 2) + ofs << 
"\"" << threads << " threads\" "; + ofs << std::endl; + + double time; + + for (size_t sz = 128; sz <= 1024; sz *= 2) + { + wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 500); + + ofs << sz << " "; + + wec.init(); + time = solve_sequential(wec); + ofs << time << " "; + + wec.init(); + time = solve_sequential_blocked(wec); + ofs << time << " "; + +#ifdef HAVE_CUDA + wec.init(); + time = solve_cuda(wec); + ofs << time << " "; +#endif + + auto maxthreads = std::thread::hardware_concurrency(); + + for (size_t threads = 1; threads <= maxthreads; threads *= 2) + { + wec.init(); + time = solve_multithread(wec, threads); + ofs << time << " "; + } + + ofs << std::endl; + } +} + + + diff --git a/kokkos-testing/fd_catalog/fd_wave.hpp b/kokkos-testing/fd_catalog/fd_wave.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b58c88c1ffbbd584d88b790c9f6d12d7162d5cde --- /dev/null +++ b/kokkos-testing/fd_catalog/fd_wave.hpp @@ -0,0 +1,61 @@ +/* Performance evaluation of different implementations of finite difference + * schemes. + * + * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group + */ + +#pragma once + +#define WAVE_8_HALO_SIZE 4 + +template<typename T> +struct wave_2D_params +{ + int maxrow; + int maxcol; + T dt; + T velocity; + T damping; +}; + +template<typename T> +struct wave_equation_context +{ + fd_grid<T> g_prev; + fd_grid<T> g_curr; + fd_grid<T> g_next; + + T velocity; + T damping; + T dt; + int maxiter; + + wave_equation_context(size_t rows, size_t cols, T vel, T damp, T pdt, int pmaxiter) + { + g_prev = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE); + g_curr = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE); + g_next = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE); + velocity = vel; + damping = damp; + dt = pdt; + maxiter = pmaxiter; + + init(); + } + + void init(void) + { + for (size_t i = 0; i < g_curr.domain_rows(); i++) + { + for (size_t j = 0; j < g_curr.domain_cols(); j++) + { + T y = g_curr.dy()*i - 0.3; + T x = g_curr.dx()*j - 0.1; + g_prev(i,j) = -std::exp(-2400*(x*x + y*y)); + g_curr(i,j) = 2*dt*g_prev(i,j); + g_next(i,j) = 0.0; + } + } + } +}; + diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp new file mode 100644 index 0000000000000000000000000000000000000000..99c5395562c3626b0230308d97d7fb0859d10615 --- /dev/null +++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp @@ -0,0 +1,410 @@ +/* Performance evaluation of different implementations of finite difference + * schemes. + * + * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group + */ + +#include <iostream> +#include <thread> +#include <condition_variable> +#include <cassert> +#include <list> +#include <cmath> + +#include "fd_grid.hpp" +#include "fd_dataio.hpp" +#include "fd_wave.hpp" + +struct tile +{ + int dim_x; + int dim_y; + int idx_x; + int idx_y; + + tile() {} + + tile(int dx, int dy, int ix, int iy) + : dim_x(dx), dim_y(dy), idx_x(ix), idx_y(iy) + {} +}; + +std::ostream& +operator<<(std::ostream& os, const tile& tl) +{ + os << "tile (" << tl.idx_x << ", " << tl.idx_y << ") " << tl.dim_x << ", " << tl.dim_y; + return os; +} + +/* This is the naive implementation of a wave equation kernel. You can specify + * a tile to compute only on a subdomain. 
*/ +template<typename T> +void +wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, + fd_grid<T>& g_next, + const wave_2D_params<T>& params, tile tl) +{ + int maxrow = params.maxrow; + int maxcol = params.maxcol; + T dt = params.dt; + T c = params.velocity; + T a = params.damping; + + assert(maxcol > 1); + assert(maxrow > 1); + + /**** Initialize constants ****/ + static const T w0 = -205.0/72.0; + static const T w1 = 8.0/5.0; + static const T w2 = -1.0/5.0; + static const T w3 = 8.0/315.0; + static const T w4 = -1.0/560.0; + static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; + + for (size_t i = 0; i < tl.dim_y; i++) + { + for (size_t j = 0; j < tl.dim_x; j++) + { + /* Get X/Y coordinates of this grid point */ + int ofs_x = tl.dim_x * tl.idx_x + j; + int ofs_y = tl.dim_y * tl.idx_y + i; + + if ( (ofs_x < maxcol) and (ofs_y < maxrow) ) + { + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T deriv_x = 0.0; + //deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]); + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y,ofs_x+k); + + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T deriv_y = 0.0; + //deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]); + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y+k,ofs_x); + + T val = (deriv_x + deriv_y) - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) + (2.0 - a*dt)*g_curr(ofs_y, ofs_x); + if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) ) + val = 0; + + g_next(ofs_y, ofs_x) = val; + } + } + } +} + +/* This is a "blocked" kernel. It copies a block on a "private" chunk of + * memory, computes there and stores the result back. This approach looks + * slightly faster than the "naive" kernel. 
*/ + +#define WAVE_8_CPUKER_COLS 56 +#define WAVE_8_CPUKER_ROWS 56 + +template<typename T> +void +wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, + fd_grid<T>& g_next, + const wave_2D_params<T>& params, tile tl) +{ + const int maxrow = params.maxrow; + const int maxcol = params.maxcol; + const T dt = params.dt; + const T c = params.velocity; + const T a = params.damping; + + assert(maxcol > 1); + assert(maxrow > 1); + + /**** Initialize constants ****/ + static const T w0 = -205.0/72.0; + static const T w1 = 8.0/5.0; + static const T w2 = -1.0/5.0; + static const T w3 = 8.0/315.0; + static const T w4 = -1.0/560.0; + static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; + + static const size_t MX_SZ = WAVE_8_CPUKER_COLS+2*WAVE_8_HALO_SIZE; + static const size_t MY_SZ = WAVE_8_CPUKER_ROWS+2*WAVE_8_HALO_SIZE; + + T s_curr[MX_SZ][MY_SZ]; + + for (size_t bi = 0; bi < tl.dim_y; bi += WAVE_8_CPUKER_ROWS) + { + for (size_t bj = 0; bj < tl.dim_x; bj += WAVE_8_CPUKER_COLS) + { + /* Load working tile */ + for (size_t i = 0; i < MY_SZ; i++) + { + for (size_t j = 0; j < MX_SZ; j++) + { + int ofs_x = tl.idx_x * tl.dim_x + bj + j; + int ofs_y = tl.idx_y * tl.dim_y + bi + i; + + if ( (ofs_x < (maxcol+WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow+WAVE_8_HALO_SIZE)) ) + s_curr[i][j] = g_curr(ofs_y-WAVE_8_HALO_SIZE, ofs_x-WAVE_8_HALO_SIZE); + } + } + + /* Do finite differences */ + for (size_t i = 0; i < WAVE_8_CPUKER_ROWS; i++) + { + for (size_t j = 0; j < WAVE_8_CPUKER_COLS; j++) + { + /* Get X/Y coordinates of this grid point */ + int ofs_x = tl.idx_x * tl.dim_x + bj + j; + int ofs_y = tl.idx_y * tl.dim_y + bi + i; + + if ( (ofs_x < maxcol) and (ofs_y < maxrow) ) + { + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T deriv_x = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; + + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T deriv_y = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; + + T val = (deriv_x + deriv_y) + - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) + + (2.0 - a*dt) * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + + if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) ) + val = 0; + + g_next(ofs_y, ofs_x) = val; + } + } + } + } + } +} + +template<typename T, bool blocked> +double +solve_sequential_aux(wave_equation_context<T>& wec) +{ + /* Simulation parameters */ + wave_2D_params<T> params; + params.maxcol = wec.g_curr.domain_cols(); + params.maxrow = wec.g_curr.domain_rows(); + params.dt = wec.dt; + params.velocity = wec.velocity; + params.damping = wec.damping; + + tile tl; + tl.dim_x = params.maxcol; + tl.dim_y = params.maxrow; + tl.idx_x = 0; + tl.idx_y = 0; + + double time = 0.0; + + std::ofstream ofs("itertime.txt"); + + for (size_t i = 0; i < wec.maxiter; i++) + { + auto start = std::chrono::high_resolution_clock::now(); + + if (blocked) + wave_2D_kernel_tiled_blocked(wec.g_prev, wec.g_curr, wec.g_next, params, tl); + else + wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl); + + auto stop = std::chrono::high_resolution_clock::now(); + + std::chrono::duration<double, std::milli> ms = stop - start; + time += ms.count(); + + std::swap(wec.g_prev, wec.g_curr); + std::swap(wec.g_curr, wec.g_next); + +#ifdef HAVE_SILO +#ifdef SAVE_TIMESTEPS + if ( (i%100) == 0 ) + { + std::stringstream ss; + ss << 
"wave_tiled_" << i << ".silo"; + visit_dump(wec.g_curr, ss.str()); + } +#endif /* SAVE_TIMESTEPS */ +#endif /* HAVE_SILO */ + + ofs << i << " " << ms.count() << std::endl; + } + + if (blocked) + { + std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; + std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl; + } + else + { + std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; + std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl; + } + +#ifdef HAVE_SILO + visit_dump(wec.g_curr, "wave_sequential_lastiter.silo"); +#endif /* HAVE_SILO */ + + return time/wec.maxiter; +} + +template<typename T> +double solve_sequential(wave_equation_context<T>& wec) +{ + return solve_sequential_aux<T,false>(wec); +} + +template<typename T> +double solve_sequential_blocked(wave_equation_context<T>& wec) +{ + return solve_sequential_aux<T,true>(wec); +} + +template<typename T> +double solve_multithread(wave_equation_context<T>& wec, size_t nths) +{ + /* Simulation parameters */ + wave_2D_params<T> params; + params.maxcol = wec.g_curr.domain_cols(); + params.maxrow = wec.g_curr.domain_rows(); + params.dt = wec.dt; + params.velocity = wec.velocity; + params.damping = wec.damping; + + /* Tile sizes */ + size_t tile_size_x = 32; + size_t tile_size_y = 32; + size_t num_tiles_x = params.maxcol/tile_size_x + 1; + size_t num_tiles_y = params.maxrow/tile_size_y + 1; + + + /* Multithreading stuff */ + std::mutex cout_mtx; + std::mutex cv_mtx; + std::condition_variable prod_cv; + std::condition_variable cons_cv; + std::list<tile> tiles; + size_t running_threads; + bool iteration_finished = false; + std::vector<bool> thread_done(nths); + + for (size_t i = 0; i < nths; i++) + thread_done[i] = true; + + auto thread_lambda = [&](int tid) { + while (1) + { + /* Wait for the producer to notify that there's something to do */ + { + std::unique_lock<std::mutex> lck(cv_mtx); + while ( thread_done[tid] ) + cons_cv.wait(lck); + + if (iteration_finished) + { + assert(tiles.size() == 0); + return; + } + } + + /* Fetch work and process data */ + while (1) + { + tile tl; + + { + std::unique_lock<std::mutex> lck(cv_mtx); + if (tiles.size() == 0) + break; + + assert(tiles.size() != 0); + tl = tiles.front(); + tiles.pop_front(); + } + + wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl); + } + + /* Work for this thread finished, notify producer */ + std::unique_lock<std::mutex> lck(cv_mtx); + //assert(running_threads > 0); + running_threads--; + prod_cv.notify_one(); + thread_done[tid] = true; + } + }; + + + std::vector<std::thread> threads(nths); + for (size_t i = 0; i < nths; i++) + threads[i] = std::thread(thread_lambda, i); + + double time = 0.0; + + for (size_t iter = 0; iter < wec.maxiter; iter++) + { + auto start = std::chrono::high_resolution_clock::now(); + std::unique_lock<std::mutex> lck(cv_mtx); + for (size_t i = 0; i < num_tiles_y; i++) + for (size_t j = 0; j < num_tiles_x; j++) + tiles.push_back( tile(tile_size_x, tile_size_y, j, i) ); + + /* Mark data ready and start the threads */ + for (size_t i = 0; i < nths; i++) + thread_done[i] = false; + + cons_cv.notify_all(); + + while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) ) + prod_cv.wait(lck); + + assert(tiles.size() == 0); + + auto stop = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double, std::milli> ms = stop - start; + time += ms.count(); + + std::swap(wec.g_prev, 
wec.g_curr); + std::swap(wec.g_curr, wec.g_next); + +#ifdef HAVE_SILO +#ifdef SAVE_TIMESTEPS + if ( (iter%100) == 0 ) + { + std::stringstream ss; + ss << "wave_mt_" << iter << ".silo"; + visit_dump(wec.g_curr, ss.str()); + } +#endif /* SAVE_TIMESTEPS */ +#endif /* HAVE_SILO */ + } + + /* Tell all the threads to stop */ + { + std::unique_lock<std::mutex> lck(cv_mtx); + for (size_t i = 0; i < nths; i++) + thread_done[i] = false; + iteration_finished = true; + cons_cv.notify_all(); + } + + /* Wait for all the threads to finish */ + for (auto& th : threads) + th.join(); + + std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): "; + std::cout << time/wec.maxiter << "ms" << std::endl; + std::cout << "[Wave][MT] Wall Time (" << nths << " threads): "; + std::cout << time << "ms" << std::endl; + +#ifdef HAVE_SILO + visit_dump(wec.g_curr, "wave_tiled_lastiter.silo"); +#endif /* HAVE_SILO */ + + return time/wec.maxiter; +} + diff --git a/kokkos-testing/fd_catalog/fd_wave_cuda.cu b/kokkos-testing/fd_catalog/fd_wave_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..2f4e25ca1caa6a00d779b1363b381b722f1d9f21 --- /dev/null +++ b/kokkos-testing/fd_catalog/fd_wave_cuda.cu @@ -0,0 +1,229 @@ +/* Performance evaluation of different implementations of finite difference + * schemes. + * + * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group + */ + +#include <iostream> + +#include "fd_grid.hpp" +#include "fd_dataio.hpp" +#include "fd_wave.hpp" + +#define DEBUG + +#include "cuda_runtime.h" + +inline cudaError_t checkCuda(cudaError_t result) +{ +#if defined(DEBUG) || defined(_DEBUG) + if (result != cudaSuccess) { + fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + assert(result == cudaSuccess); + } +#endif + return result; +} + +/* 8th order 2nd derivative weights */ +__constant__ float d_w32[9]; +__constant__ double d_w64[9]; + +#define WAVE_8_KER_ROWS 32 +#define WAVE_8_KER_COLS 32 + +template<typename T> +__global__ void +wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params) +{ + // HHHH + // H****H H = halo, * = data + // H****H We load a tile as in figure. This wastes bandwidth + // H****H because halos are loaded two times each, but it is + // H****H the easiest way to remain generic w.r.t. 
grid size + // HHHH + + __shared__ T s_curr[WAVE_8_KER_ROWS+2*WAVE_8_HALO_SIZE][WAVE_8_KER_COLS+2*WAVE_8_HALO_SIZE]; + + int maxrow = params.maxrow; + int maxcol = params.maxcol; + T dt = params.dt; + T c = params.velocity; + T a = params.damping; + + assert(maxcol > 1); + assert(maxrow > 1); + + /* Get X/Y coordinates of this grid point */ + int ofs_x = blockDim.x * blockIdx.x + threadIdx.x + WAVE_8_HALO_SIZE; + int ofs_y = blockDim.y * blockIdx.y + threadIdx.y + WAVE_8_HALO_SIZE; + + /* Get offset of this grid point in the fd grid array */ + int ofs = (maxcol + 2*WAVE_8_HALO_SIZE) * ofs_y + ofs_x; + + if ( (ofs_x < (maxcol + WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow + WAVE_8_HALO_SIZE)) ) + { + int i = threadIdx.y + WAVE_8_HALO_SIZE; + int j = threadIdx.x + WAVE_8_HALO_SIZE; + + s_curr[i][j] = u_curr[ofs]; + + if (threadIdx.x < WAVE_8_HALO_SIZE) + { + /* X-halo */ + int ox = min(WAVE_8_KER_COLS, maxcol-blockDim.x * blockIdx.x); + s_curr[i][j-WAVE_8_HALO_SIZE] = u_curr[ofs-WAVE_8_HALO_SIZE]; + s_curr[i][j+ox] = u_curr[ofs+ox]; + } + + if (threadIdx.y < WAVE_8_HALO_SIZE) + { + /* Y-halo */ + int oy = min(WAVE_8_KER_ROWS, maxrow-blockDim.y * blockIdx.y); + int b_ofs = ofs - WAVE_8_HALO_SIZE * (2*WAVE_8_HALO_SIZE+maxcol); + int t_ofs = ofs + oy * (2*WAVE_8_HALO_SIZE+maxcol); + s_curr[i-WAVE_8_HALO_SIZE][j] = u_curr[b_ofs]; + s_curr[i+oy][j] = u_curr[t_ofs]; + } + + __syncthreads(); + + T* w; + if ( std::is_same<T,float>::value) + w = (T*)d_w32; + if ( std::is_same<T,double>::value) + w = (T*)d_w64; + + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T deriv_x = 0.0; + //deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]); + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i][j+k]; + + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T deriv_y = 0.0; + //deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]); + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+k][j]; + + T val = (deriv_x + deriv_y) - (1.0 - a*dt) * u_prev[ofs] + (2.0 - a*dt)*s_curr[i][j]; + if ( (ofs_x == WAVE_8_HALO_SIZE) or (ofs_y == WAVE_8_HALO_SIZE) or + (ofs_x == maxcol+WAVE_8_HALO_SIZE-1) or (ofs_y == maxrow+WAVE_8_HALO_SIZE-1) + ) + val = 0; + + u_next[ofs] = val; + } +} + +template<typename T> +double solve_cuda_aux(wave_equation_context<T>& wec) +{ + /* Get device properties */ + cudaDeviceProp prop; + checkCuda( cudaGetDeviceProperties(&prop, 0) ); + printf("Device: %s (Compute Capability: %d.%d)\n", + prop.name, prop.major, prop.minor); + + /**** Initialize constants ****/ + T w0 = -205.0/72.0; + T w1 = 8.0/5.0; + T w2 = -1.0/5.0; + T w3 = 8.0/315.0; + T w4 = -1.0/560.0; + T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; + + if (std::is_same<T,float>::value) + checkCuda( cudaMemcpyToSymbol(d_w32, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) ); + if (std::is_same<T,double>::value) + checkCuda( cudaMemcpyToSymbol(d_w64, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) ); + + /* Allocate memory on device */ + T* u_prev; + T* u_curr; + T* u_next; + T* u_temp; + + checkCuda( cudaMalloc((void**)&u_prev, wec.g_prev.size()*sizeof(T)) ); + checkCuda( cudaMalloc((void**)&u_curr, wec.g_curr.size()*sizeof(T)) ); + checkCuda( cudaMalloc((void**)&u_next, wec.g_next.size()*sizeof(T)) ); + + /* Copy grids on device */ + checkCuda( cudaMemcpy(u_prev, wec.g_prev.data(), wec.g_prev.size()*sizeof(T), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(u_curr, wec.g_curr.data(), 
wec.g_curr.size()*sizeof(T), cudaMemcpyHostToDevice) ); + checkCuda( cudaMemcpy(u_next, wec.g_next.data(), wec.g_next.size()*sizeof(T), cudaMemcpyHostToDevice) ); + + /* Launch parameters */ + dim3 grid_size(wec.g_curr.domain_cols()/WAVE_8_KER_COLS + 1, wec.g_curr.domain_rows()/WAVE_8_KER_ROWS + 1); + dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS); + + /* Simulation parameters */ + wave_2D_params<T> params; + params.maxcol = wec.g_curr.domain_cols(); + params.maxrow = wec.g_curr.domain_rows(); + params.dt = wec.dt; + params.velocity = wec.velocity; + params.damping = wec.damping; + + /* Prepare for timing */ + cudaEvent_t startEvent, stopEvent; + checkCuda( cudaEventCreate(&startEvent) ); + checkCuda( cudaEventCreate(&stopEvent) ); + + checkCuda( cudaEventRecord(startEvent, 0) ); + + for (size_t i = 0; i < wec.maxiter; i++) + { + wave_2D_kernel_cuda<<<grid_size, threads_per_block>>>(u_prev, u_curr, u_next, params); + + + u_temp = u_prev; + u_prev = u_curr; + u_curr = u_next; + u_next = u_temp; + +#ifdef HAVE_SILO +#ifdef SAVE_TIMESTEPS + if ( (i%100) == 0 ) + { + checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) ); + std::stringstream ss; + ss << "wave_cuda_" << i << ".silo"; + visit_dump(wec.g_curr, ss.str()); + } +#endif /* SAVE_TIMESTEPS */ +#endif /* HAVE_SILO */ + } + + float milliseconds; + checkCuda( cudaEventRecord(stopEvent, 0) ); + checkCuda( cudaEventSynchronize(stopEvent) ); + checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) ); + + + std::cout << "[Wave][Cuda] Iteration Time: " << milliseconds/wec.maxiter << "ms" << std::endl; + std::cout << "[Wave][Cuda] Wall Time: " << milliseconds << "ms" << std::endl; + +#ifdef HAVE_SILO + checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) ); + visit_dump(wec.g_curr, "wave_cuda_lastiter.silo"); +#endif /* HAVE_SILO */ + + return milliseconds/wec.maxiter; +} + +double solve_cuda(wave_equation_context<float>& wec) +{ + return solve_cuda_aux(wec); +} + +double solve_cuda(wave_equation_context<double>& wec) +{ + return solve_cuda_aux(wec); +} + + + + + + diff --git a/kokkos-testing/fd_catalog/fd_wave_cuda.hpp b/kokkos-testing/fd_catalog/fd_wave_cuda.hpp new file mode 100644 index 0000000000000000000000000000000000000000..4638c35d27d6979324ed945d1aaa26df287c0123 --- /dev/null +++ b/kokkos-testing/fd_catalog/fd_wave_cuda.hpp @@ -0,0 +1,10 @@ +/* Performance evaluation of different implementations of finite difference + * schemes. 
+ * + * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group + */ + +#pragma once + +double solve_cuda(wave_equation_context<float>&); +double solve_cuda(wave_equation_context<double>&); \ No newline at end of file diff --git a/kokkos-testing/finite_difference_baseline.cpp b/kokkos-testing/finite_difference_baseline.cpp index 7d58903cb8776d27f49cb5e57877f0c90768aa50..549a7c7725b41d74bcb888fea53159bd017dbb13 100644 --- a/kokkos-testing/finite_difference_baseline.cpp +++ b/kokkos-testing/finite_difference_baseline.cpp @@ -85,7 +85,7 @@ visit_dump(const fd_grid<double>& g, const std::string& fn) return 0; } -//#define HIGH_ORDER +#define HIGH_ORDER #ifdef HIGH_ORDER #define HALO_SIZE 4 @@ -97,16 +97,16 @@ int main( int argc, char* argv[] ) { using namespace std::chrono; - int Nx = 1024; - int Ny = 1024; + int Nx = 500; + int Ny = 500; int halo = HALO_SIZE; int rows = Ny + 2*halo; int cols = Nx + 2*halo; - int timesteps = 500; + int timesteps = 5000; double dx = 1./(Nx-1); double dy = 1./(Ny-1); - double dt = 0.0001; + double dt = 0.001; double velocity = 1; double damping = 0.1; diff --git a/kokkos-testing/finite_differences.cu b/kokkos-testing/finite_differences.cu index e8ae55af658b03ca703221b5f8c90b07fbd9d79c..bbd1dd1ff4331ea1464d327bb4b0e25f574f579f 100644 --- a/kokkos-testing/finite_differences.cu +++ b/kokkos-testing/finite_differences.cu @@ -1,278 +1,48 @@ #include <iostream> #include <fstream> #include <vector> -#include <cstdio> -#include <cassert> -#include <silo.h> - -#define DEBUG - -#define SINGLE_PRECISION - -template<typename T> -class fd_grid -{ - std::vector<T> m_data; - size_t m_rows, m_cols; - -public: - fd_grid() - {} - - fd_grid(size_t rows, size_t cols) : - m_data(rows*cols), m_rows(rows), m_cols(cols) - {} - - fd_grid(fd_grid&& other) - { - std::swap(m_data, other.m_data); - } - - fd_grid& operator=(fd_grid&& other) - { - std::swap(m_data, other.m_data); - return *this; - } - - T operator()(size_t i, size_t j) const - { - return m_data[i*m_cols + j]; - } - - T& operator()(size_t i, size_t j) - { - return m_data[i*m_cols + j]; - } - - size_t rows() const { return m_rows; } - size_t cols() const { return m_cols; } - - T* data() { return m_data.data(); } - const T* data() const { return m_data.data(); } -}; - -template<typename T> -int -visit_dump(const fd_grid<T>& g, const std::string& fn) -{ - static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float"); - - DBfile *db = nullptr; - db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5); - if (!db) - { - std::cout << "Cannot write simulation output" << std::endl; - return -1; - } - - auto size_x = g.cols(); - auto size_y = g.rows(); - - std::vector<T> x(size_x); - std::vector<T> y(size_y); - - for (size_t i = 0; i < size_x; i++) - x.at(i) = T(i)/(size_x-1); - - for (size_t i = 0; i < size_y; i++) - y.at(i) = T(i)/(size_y-1); - - int dims[] = { int(size_y), int(size_y) }; - int ndims = 2; - - T *coords[] = {x.data(), y.data()}; - - if (std::is_same<T,float>::value) - { - DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL); - DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL); - } - else - { - DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL); - DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL); - } - - DBClose(db); - return 0; -} - - - -inline cudaError_t checkCuda(cudaError_t result) -{ -#if 
defined(DEBUG) || defined(_DEBUG) - if (result != cudaSuccess) { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); - } -#endif - return result; -} - -#ifdef SINGLE_PRECISION -__constant__ float d_w[9]; -#else -__constant__ double d_w[9]; -#endif -template<typename T> -__global__ void test_kernel(T* data) -{ - int ofs_x = blockDim.x * blockIdx.x + threadIdx.x; - int ofs_y = blockDim.y * blockIdx.y + threadIdx.y; - - int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x; - - data[ofs] = gridDim.x * blockIdx.y + blockIdx.x; -} - - -#define GRID_ROWS 32 -#define GRID_COLS 32 -#define HALO_SIZE 4 - -int grid_rows; -int grid_cols; - -template<typename T> -__global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next) -{ - __shared__ T s_curr[GRID_COLS+2*HALO_SIZE]; - - /* Get X/Y coordinates of this grid point */ - int ofs_x = blockDim.x * blockIdx.x + threadIdx.x; - int ofs_y = blockDim.y * blockIdx.y + threadIdx.y; - - /* Get offset of this grid point in the fd grid array */ - int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x; - - int shm_ofs = threadIdx.x + HALO_SIZE; - - s_curr[shm_ofs] = u_curr[ofs]; - - __syncthreads(); - - - if (threadIdx.x < 4) - { - s_curr[threadIdx.x] = 0.0; - s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0; - } - - __syncthreads(); - - T dx = 0.0; - for (int k = -HALO_SIZE; k <= HALO_SIZE; k++) - dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k]; - - u_next[ofs] = dx; -} - -#if 0 -template<typename T> -__global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next) -{ - __shared__ T s_curr[2*HALO_SIZE][32]; - - /* Get X/Y coordinates of this grid point */ - int ofs_x = blockDim.x * blockIdx.x + threadIdx.x; - int ofs_y = blockDim.y * blockIdx.y + threadIdx.y; - - /* Get offset of this grid point in the fd grid array */ - int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x; +#include <sstream> +#include <cstdio> - int shm_ofs_x = threadIdx.x; - int shm_ofs_y = threadIdx.y; - s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs]; +#include "fd_grid.hpp" +#include "fd_dataio.hpp" +#include "fd_wave_cpu.hpp" - __syncthreads(); - - - if (threadIdx.y < 4) - { - s_curr[threadIdx.x] = 0.0; - s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0; - } - __syncthreads(); - - T dx = 0.0; - for (int k = -HALO_SIZE; k <= HALO_SIZE; k++) - dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k]; - u_next[ofs] = dx; -} -#endif int main(void) { - #ifdef SINGLE_PRECISION using T = float; #else using T = double; #endif - /**** Get device properties ****/ - cudaDeviceProp prop; - checkCuda( cudaGetDeviceProperties(&prop, 0) ); - printf("Device Name: %s\n", prop.name); - printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor); - - /**** Initialize constants ****/ - T w0 = -205.0/72.0; - T w1 = 8.0/5.0; - T w2 = -1.0/5.0; - T w3 = 8.0/315.0; - T w4 = -1.0/560.0; - T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; - checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) ); - - /**** Prepare for timing ****/ - float milliseconds; - cudaEvent_t startEvent, stopEvent; - checkCuda( cudaEventCreate(&startEvent) ); - checkCuda( cudaEventCreate(&stopEvent) ); + wave_equation_context<T> wec(512, 512, 1, 0.1, 0.0001, 5000); - /**** Prepare grid and copy to device ****/ - fd_grid<T> g(GRID_ROWS, GRID_COLS); - for (size_t i = 0; i < GRID_ROWS; i++) - for (size_t j = 0; j < GRID_COLS; j++) - g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS); - //g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS); - - T* u_curr; - T* u_next; - - checkCuda( 
cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) ); - checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) ); - checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) ); - - dim3 gs(1,GRID_ROWS); - dim3 tpb(GRID_COLS,1); - - checkCuda( cudaEventRecord(startEvent, 0) ); - //for (size_t i = 0; i < 1; i++) - x_kernel<<<gs, tpb>>>(u_curr, u_next); - checkCuda( cudaEventRecord(stopEvent, 0) ); - checkCuda( cudaEventSynchronize(stopEvent) ); - checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) ); - - std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl; +#if 0 +#ifdef __NVCC__ + wec.init(); + solve_cuda(wec); +#endif /* __NVCC__ */ +#endif - checkCuda( cudaDeviceSynchronize() ); + wec.init(); + solve_sequential(wec); - checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) ); - - checkCuda( cudaFree(u_next) ); - checkCuda( cudaFree(u_curr) ); + wec.init(); + solve_sequential_blocked(wec); - visit_dump(g, "cuda.silo"); + auto maxthreads = std::thread::hardware_concurrency(); - return 0; + for (size_t i = 1; i <= maxthreads; i *= 2) + { + wec.init(); + solve_multithread(wec, i); + } } -
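
Note on the new fd_catalog/CMakeLists.txt added above: as committed it only declares the project and calls find_package(CUDA), with no targets yet. Below is a minimal sketch of the wiring the new sources appear to need; the target names (fd_catalog, fd_wave_cuda), the Threads dependency, and the HAVE_CUDA definition are assumptions inferred from fd_main.cpp and fd_wave_cuda.cu, not part of this patch.

    cmake_minimum_required(VERSION 3.10)
    project(fd_catalog)

    find_package(CUDA)              # legacy FindCUDA module, as in the top-level CMakeLists.txt
    find_package(Threads REQUIRED)  # solve_multithread() uses std::thread

    add_executable(fd_catalog fd_main.cpp)
    set_target_properties(fd_catalog PROPERTIES CXX_STANDARD 14)
    target_link_libraries(fd_catalog Threads::Threads)

    if (CUDA_FOUND)
        # fd_main.cpp only calls solve_cuda() when HAVE_CUDA is defined
        cuda_add_library(fd_wave_cuda fd_wave_cuda.cu)
        target_compile_definitions(fd_catalog PRIVATE HAVE_CUDA)
        target_link_libraries(fd_catalog fd_wave_cuda ${CUDA_LIBRARIES})
    endif ()

With this layout, fd_main.cpp writes timings.txt in the same format consumed by experimental_data/timings.plot (one row per grid size, one column per solver variant), whether or not the CUDA path is enabled.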