From 374afe267e0e184edf040acff6e7c39d36eb1108 Mon Sep 17 00:00:00 2001 From: Matteo Cicuttin <datafl4sh@toxicnet.eu> Date: Mon, 23 Mar 2020 16:49:00 +0100 Subject: [PATCH] Reorganized fd_catalog a bit. --- kokkos-testing/experimental_data/timings.plot | 4 +- kokkos-testing/experimental_data/timings2.txt | 3 - kokkos-testing/fd_catalog/CMakeLists.txt | 27 +- kokkos-testing/fd_catalog/fd_grid.hpp | 28 ++ kokkos-testing/fd_catalog/fd_main.cpp | 11 +- kokkos-testing/fd_catalog/fd_wave.hpp | 5 +- kokkos-testing/fd_catalog/fd_wave_cpu.hpp | 358 +++++++++++------- kokkos-testing/fd_catalog/fd_wave_cuda.cu | 46 ++- .../fd_catalog/finite_difference_kokkos.cpp | 244 ++++++++++++ kokkos-testing/test_gemv.cpp | 37 ++ kokkos-testing/test_mandel.cpp | 135 +++++++ kokkos-testing/test_sumbw.cpp | 87 +++++ 12 files changed, 825 insertions(+), 160 deletions(-) delete mode 100644 kokkos-testing/experimental_data/timings2.txt create mode 100644 kokkos-testing/fd_catalog/finite_difference_kokkos.cpp create mode 100644 kokkos-testing/test_gemv.cpp create mode 100644 kokkos-testing/test_mandel.cpp create mode 100644 kokkos-testing/test_sumbw.cpp diff --git a/kokkos-testing/experimental_data/timings.plot b/kokkos-testing/experimental_data/timings.plot index e59f1c5..3484c34 100644 --- a/kokkos-testing/experimental_data/timings.plot +++ b/kokkos-testing/experimental_data/timings.plot @@ -1,5 +1,5 @@ set term postscript enhanced color -set output 'timing.eps' +set output 'timing-float.eps' set title '2D damped wave equation' @@ -7,4 +7,4 @@ set style data histogram set style fill solid border -1 set logscale y set grid ytics mytics -plot 'timings-float.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col +plot 'timings.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col diff --git a/kokkos-testing/experimental_data/timings2.txt b/kokkos-testing/experimental_data/timings2.txt deleted file mode 100644 index f10583e..0000000 --- a/kokkos-testing/experimental_data/timings2.txt +++ /dev/null @@ -1,3 +0,0 @@ -"order" "Baseline" "Serial" "1 thread" "2 threads" "4 threads" "CUDA baseline" -"2th order" 0.00707879 0.0141107 0.0138042 0.0072022 0.00539084 -"8th order" 0.033213 0.0419782 0.0409406 0.020956 0.0134292 0.0015 diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt index 6efe136..a6f85c5 100644 --- a/kokkos-testing/fd_catalog/CMakeLists.txt +++ b/kokkos-testing/fd_catalog/CMakeLists.txt @@ -1,11 +1,24 @@ cmake_minimum_required(VERSION 3.10 FATAL_ERROR) project(fd_catalog) include(CheckLanguage) +include(FetchContent) - -set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) +option(ENABLE_KOKKOS "Enable Kokkos" ON) +if (ENABLE_KOKKOS) + FetchContent_Declare(kokkos + GIT_REPOSITORY https://github.com/kokkos/kokkos.git + ) + FetchContent_MakeAvailable(kokkos) + + add_executable(finite_difference_kokkos finite_difference_kokkos.cpp) + target_link_libraries(finite_difference_kokkos Kokkos::kokkos siloh5) + #add_definitions(-DHAVE_KOKKOS) + #set(HAVE_KOKKOS TRUE) +endif() + option(ENABLE_CUDA "Enable CUDA if present" ON) if (ENABLE_CUDA) check_language(CUDA) @@ -20,9 +33,15 @@ if (ENABLE_CUDA) endif() set(CMAKE_CXX_FLAGS_DEBUG "-g") -set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_RELEASEASSERT "-O3 -g -fpermissive") +option(A_SAVE_TS "Save timesteps" OFF) +if (A_SAVE_TS) + add_definitions(-DHAVE_SILO) + add_definitions(-DSAVE_TIMESTEPS) +endif() + option(ENABLE_SINGLE "Enable single precision build" ON) if (ENABLE_SINGLE) if (HAVE_CUDA) @@ -34,6 +53,7 @@ if (ENABLE_SINGLE) add_executable(fd_catalog_single fd_main.cpp) endif() target_compile_definitions(fd_catalog_single PUBLIC -DSINGLE_PRECISION) + target_link_libraries(fd_catalog_single siloh5) endif() option(ENABLE_DOUBLE "Enable double precision build" ON) @@ -46,4 +66,5 @@ if (ENABLE_DOUBLE) else() add_executable(fd_catalog_double fd_main.cpp) endif() + target_link_libraries(fd_catalog_double siloh5) endif() diff --git a/kokkos-testing/fd_catalog/fd_grid.hpp b/kokkos-testing/fd_catalog/fd_grid.hpp index 4199541..6aae1db 100644 --- a/kokkos-testing/fd_catalog/fd_grid.hpp +++ b/kokkos-testing/fd_catalog/fd_grid.hpp @@ -105,6 +105,34 @@ public: T* data() { return m_data.data(); } const T* data() const { return m_data.data(); } + T* data(Int i, Int j) + { + auto ofs_row = i + m_halo; + assert(ofs_row < m_domain_rows + 2*m_halo); + + auto ofs_col = j + m_halo; + assert(ofs_col < m_domain_cols + 2*m_halo); + + auto grid_cols = m_domain_cols+2*m_halo; + auto ofs = ofs_row * grid_cols + ofs_col; + assert( (ofs >= 0) and (ofs < m_data.size()) ); + return m_data.data() + ofs; + } + + const T* data(Int i, Int j) const + { + auto ofs_row = i + m_halo; + assert(ofs_row < m_domain_rows + 2*m_halo); + + auto ofs_col = j + m_halo; + assert(ofs_col < m_domain_cols + 2*m_halo); + + auto grid_cols = m_domain_cols+2*m_halo; + auto ofs = ofs_row * grid_cols + ofs_col; + assert( (ofs >= 0) and (ofs < m_data.size()) ); + return m_data.data() + ofs; + } + size_t size() const { return m_data.size(); } T dx() const { return 1./(m_domain_cols-1); } diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp index a98258a..458d937 100644 --- a/kokkos-testing/fd_catalog/fd_main.cpp +++ b/kokkos-testing/fd_catalog/fd_main.cpp @@ -1,7 +1,7 @@ #include <iostream> #include <fstream> #include <cstdio> - +#include <unistd.h> #include "fd_wave_cpu.hpp" #ifdef HAVE_CUDA @@ -22,11 +22,11 @@ int main(void) ofs << "\"SIZE\" \"Seq\" \"SeqBlk\" "; #ifdef HAVE_CUDA - ofs << "Cuda "; + ofs << "\"Cuda\" "; #endif auto maxthreads = std::thread::hardware_concurrency(); - for (size_t threads = 1; threads <= maxthreads; threads *= 2) + for (size_t threads = 1; threads < maxthreads; threads *= 2) ofs << "\"" << threads << " threads\" "; ofs << std::endl; @@ -34,8 +34,7 @@ int main(void) for (size_t sz = 128; sz <= 1024; sz *= 2) { - wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 500); - + wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000); ofs << sz << " "; wec.init(); @@ -54,7 +53,7 @@ int main(void) auto maxthreads = std::thread::hardware_concurrency(); - for (size_t threads = 1; threads <= maxthreads; threads *= 2) + for (size_t threads = 1; threads < maxthreads; threads *= 2) { wec.init(); time = solve_multithread(wec, threads); diff --git a/kokkos-testing/fd_catalog/fd_wave.hpp b/kokkos-testing/fd_catalog/fd_wave.hpp index b58c88c..a57b557 100644 --- a/kokkos-testing/fd_catalog/fd_wave.hpp +++ b/kokkos-testing/fd_catalog/fd_wave.hpp @@ -30,7 +30,10 @@ struct wave_equation_context T dt; int maxiter; - wave_equation_context(size_t rows, size_t cols, T vel, T damp, T pdt, int pmaxiter) + size_t rows, cols; + + wave_equation_context(size_t prows, size_t pcols, T vel, T damp, T pdt, int pmaxiter) + : rows(prows), cols(pcols) { g_prev = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE); g_curr = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE); diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp index ae9c2b0..c802f0f 100644 --- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp +++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp @@ -10,7 +10,10 @@ #include <cassert> #include <list> #include <cmath> -#include <functional> +#include <algorithm> +#include <sstream> + +#include <string.h> #include "fd_grid.hpp" #include "fd_dataio.hpp" @@ -37,14 +40,18 @@ operator<<(std::ostream& os, const tile& tl) return os; } -/* This is the naive implementation of a wave equation kernel. You can specify - * a tile to compute only on a subdomain. */ +#define BLK_ROWS 56 +#define BLK_COLS 56 + template<typename T> void -wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, - fd_grid<T>& g_next, - const wave_2D_params<T>& params, tile tl) +wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, + fd_grid<T>& g_next, + const wave_2D_params<T>& params) { + /* 1) memcpy is faster than for + * 2) if you put inner block processing, you kill perf + */ int maxrow = params.maxrow; int maxcol = params.maxcol; T dt = params.dt; @@ -62,56 +69,188 @@ wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, static const T w4 = -1.0/560.0; static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; - for (size_t i = 0; i < tl.dim_y; i++) + T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE]; + + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T one_minus_adt = (1.0 - a*dt); + T two_minus_adt = (2.0 - a*dt); + + size_t blocks_x = maxcol / BLK_COLS; + size_t blocks_y = maxrow / BLK_ROWS; + size_t rem_x = maxcol % BLK_COLS; + size_t rem_y = maxrow % BLK_ROWS; + + + /* Do fixed-size blocks */ + for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS) + { + for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS) + { + /* Here all the dimensions are known at compile time, let the + * compiler do its job. */ + for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++) + { + int ofs_x = bj - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); + memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T)); + } + + for (size_t i = 0; i < BLK_ROWS; i++) + { + for (size_t j = 0; j < BLK_COLS; j++) + { + T lapl = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + + assert(j < BLK_COLS); + g_next(bi+i, bj+j) = val; + } + } + } + } + + /* Do upper blocks */ + { + size_t bi = blocks_y*BLK_ROWS; + for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS) + { + for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++) + { + + int ofs_x = bj - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); + memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T)); + } + + for (size_t i = 0; i < rem_y; i++) + { + for (size_t j = 0; j < BLK_COLS; j++) + { + T lapl = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + + assert(j < BLK_COLS); + g_next(bi+i, bj+j) = val; + } + } + } + } + + /* Do upper leftmost corner */ { - for (size_t j = 0; j < tl.dim_x; j++) + size_t bi = blocks_y*BLK_ROWS; + size_t bj = blocks_x*BLK_COLS; + + for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++) { - /* Get X/Y coordinates of this grid point */ - int ofs_x = tl.dim_x * tl.idx_x + j; - int ofs_y = tl.dim_y * tl.idx_y + i; + + int ofs_x = bj - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); + memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T)); + } - if ( (ofs_x < maxcol) and (ofs_y < maxrow) ) + for (size_t i = 0; i < rem_y; i++) + { + for (size_t j = 0; j < rem_x; j++) { - T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); - T deriv_x = 0.0; - //deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]); + T lapl = 0.0; for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y,ofs_x+k); + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; - T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); - T deriv_y = 0.0; - //deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]); for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y+k,ofs_x); + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + + assert(j < BLK_COLS); + g_next(bi+i, bj+j) = val; + } + } + } + + /* Do leftmost blocks */ + for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS) + { + size_t bj = blocks_x*BLK_COLS; - T val = (deriv_x + deriv_y) - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) + (2.0 - a*dt)*g_curr(ofs_y, ofs_x); - if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) ) - val = 0; + for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++) + { + + int ofs_x = bj - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); + memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T)); + } - g_next(ofs_y, ofs_x) = val; + for (size_t i = 0; i < BLK_ROWS; i++) + { + for (size_t j = 0; j < rem_x; j++) + { + T lapl = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + + assert(j < BLK_COLS); + g_next(bi+i, bj+j) = val; } } } -} -/* This is a "blocked" kernel. It copies a block on a "private" chunk of - * memory, computes there and stores the result back. This approach looks - * slightly faster than the "naive" kernel. */ + /* Apply BCs */ + for (size_t j = 0; j < maxcol; j++) + { + g_next(0,j) = 0; + g_next(maxrow-1, j) = 0; + } -#define WAVE_8_CPUKER_COLS 56 -#define WAVE_8_CPUKER_ROWS 56 + for (size_t i = 0; i < maxrow; i++) + { + g_next(i, 0) = 0; + g_next(i, maxcol-1) = 0; + } +} template<typename T> void -wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, - fd_grid<T>& g_next, - const wave_2D_params<T>& params, tile tl) +wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, + fd_grid<T>& g_next, + const wave_2D_params<T>& params, + size_t thread_id = 0, size_t num_threads = 1) { - const int maxrow = params.maxrow; - const int maxcol = params.maxcol; - const T dt = params.dt; - const T c = params.velocity; - const T a = params.damping; + int maxrow = params.maxrow; + int maxcol = params.maxcol; + T dt = params.dt; + T c = params.velocity; + T a = params.damping; assert(maxcol > 1); assert(maxrow > 1); @@ -124,60 +263,28 @@ wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, static const T w4 = -1.0/560.0; static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; - static const size_t MX_SZ = WAVE_8_CPUKER_COLS+2*WAVE_8_HALO_SIZE; - static const size_t MY_SZ = WAVE_8_CPUKER_ROWS+2*WAVE_8_HALO_SIZE; - - T s_curr[MX_SZ][MY_SZ]; + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T one_minus_adt = (1.0 - a*dt); + T two_minus_adt = (2.0 - a*dt); - for (size_t bi = 0; bi < tl.dim_y; bi += WAVE_8_CPUKER_ROWS) + for (size_t i = thread_id; i < maxrow; i += num_threads) { - for (size_t bj = 0; bj < tl.dim_x; bj += WAVE_8_CPUKER_COLS) + for (size_t j = 0; j < maxcol; j++) { - /* Load working tile */ - for (size_t i = 0; i < MY_SZ; i++) - { - for (size_t j = 0; j < MX_SZ; j++) - { - int ofs_x = tl.idx_x * tl.dim_x + bj + j; - int ofs_y = tl.idx_y * tl.dim_y + bi + i; - - if ( (ofs_x < (maxcol+WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow+WAVE_8_HALO_SIZE)) ) - s_curr[i][j] = g_curr(ofs_y-WAVE_8_HALO_SIZE, ofs_x-WAVE_8_HALO_SIZE); - } - } - - /* Do finite differences */ - for (size_t i = 0; i < WAVE_8_CPUKER_ROWS; i++) - { - for (size_t j = 0; j < WAVE_8_CPUKER_COLS; j++) - { - /* Get X/Y coordinates of this grid point */ - int ofs_x = tl.idx_x * tl.dim_x + bj + j; - int ofs_y = tl.idx_y * tl.dim_y + bi + i; - - if ( (ofs_x < maxcol) and (ofs_y < maxrow) ) - { - T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); - T deriv_x = 0.0; - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; - - T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); - T deriv_y = 0.0; - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; - - T val = (deriv_x + deriv_y) - - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) - + (2.0 - a*dt) * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + T lapl = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i,j+k); + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j); - if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) ) - val = 0; + T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j); - g_next(ofs_y, ofs_x) = val; - } - } - } + if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) ) + val = 0; + + g_next(i, j) = val; } } } @@ -202,16 +309,16 @@ solve_sequential_aux(wave_equation_context<T>& wec) double time = 0.0; - std::ofstream ofs("itertime.txt"); + //std::ofstream ofs("itertime.txt"); for (size_t i = 0; i < wec.maxiter; i++) { auto start = std::chrono::high_resolution_clock::now(); if (blocked) - wave_2D_kernel_tiled_blocked(wec.g_prev, wec.g_curr, wec.g_next, params, tl); + wave_2D_kernel_blk(wec.g_prev, wec.g_curr, wec.g_next, params); else - wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl); + wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params); auto stop = std::chrono::high_resolution_clock::now(); @@ -226,26 +333,37 @@ solve_sequential_aux(wave_equation_context<T>& wec) if ( (i%100) == 0 ) { std::stringstream ss; - ss << "wave_tiled_" << i << ".silo"; + ss << "wave_seq_" << i << ".silo"; visit_dump(wec.g_curr, ss.str()); } #endif /* SAVE_TIMESTEPS */ #endif /* HAVE_SILO */ - ofs << i << " " << ms.count() << std::endl; + //ofs << i << " " << ms.count() << std::endl; } if (blocked) { std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl; + + double itertime = time/wec.maxiter; + double gflops_s = 60*(params.maxrow*params.maxcol)/(1e6*itertime); + std::cout << "[Wave][SeqBlk] GFlops/s: " << gflops_s << std::endl; } else { std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl; - } + + double itertime = time/wec.maxiter; + double gflops_s = 60*(params.maxrow*params.maxcol)/(1e6*itertime); + std::cout << "[Wave][Sequential] GFlops/s: " << gflops_s << std::endl; + size_t kernel_bytes = 3*sizeof(T)*(params.maxrow*params.maxcol); + double gbytes_s = kernel_bytes/(1e6*itertime); + std::cout << "[Wave][Sequential] Bandwidth: " << gbytes_s << "GB/s" << std::endl; + } #ifdef HAVE_SILO visit_dump(wec.g_curr, "wave_sequential_lastiter.silo"); #endif /* HAVE_SILO */ @@ -276,95 +394,57 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths) params.velocity = wec.velocity; params.damping = wec.damping; - /* Tile sizes */ - size_t tile_size_x = 32; - size_t tile_size_y = 32; - size_t num_tiles_x = params.maxcol/tile_size_x + 1; - size_t num_tiles_y = params.maxrow/tile_size_y + 1; - /* Multithreading stuff */ - std::mutex cout_mtx; std::mutex cv_mtx; std::condition_variable prod_cv; std::condition_variable cons_cv; - std::list<tile> tiles; - size_t running_threads; - bool iteration_finished = false; std::vector<bool> thread_done(nths); + bool iteration_finished = false; - for (size_t i = 0; i < nths; i++) - thread_done[i] = true; - - auto thread_lambda = [&](int tid) { + auto thread_lambda = [&](size_t thread_id, size_t num_threads) { while (1) { /* Wait for the producer to notify that there's something to do */ { std::unique_lock<std::mutex> lck(cv_mtx); - while ( thread_done[tid] ) + while ( thread_done[thread_id] ) cons_cv.wait(lck); if (iteration_finished) - { - assert(tiles.size() == 0); return; - } } - /* Fetch work and process data */ - while (1) - { - tile tl; - - { - std::unique_lock<std::mutex> lck(cv_mtx); - if (tiles.size() == 0) - break; - - assert(tiles.size() != 0); - tl = tiles.front(); - tiles.pop_front(); - } - - wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl); - } + /* Do the timestep */ + wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads); /* Work for this thread finished, notify producer */ std::unique_lock<std::mutex> lck(cv_mtx); - //assert(running_threads > 0); - running_threads--; prod_cv.notify_one(); - thread_done[tid] = true; + thread_done[thread_id] = true; } }; std::vector<std::thread> threads(nths); for (size_t i = 0; i < nths; i++) - threads[i] = std::thread(thread_lambda, i); + threads[i] = std::thread(thread_lambda, i, nths); double time = 0.0; for (size_t iter = 0; iter < wec.maxiter; iter++) { auto start = std::chrono::high_resolution_clock::now(); - std::unique_lock<std::mutex> lck(cv_mtx); - for (size_t i = 0; i < num_tiles_y; i++) - for (size_t j = 0; j < num_tiles_x; j++) - tiles.push_back( tile(tile_size_x, tile_size_y, j, i) ); + std::unique_lock<std::mutex> lck(cv_mtx); /* Mark data ready and start the threads */ for (size_t i = 0; i < nths; i++) thread_done[i] = false; - cons_cv.notify_all(); while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) ) prod_cv.wait(lck); - assert(tiles.size() == 0); - auto stop = std::chrono::high_resolution_clock::now(); std::chrono::duration<double, std::milli> ms = stop - start; time += ms.count(); diff --git a/kokkos-testing/fd_catalog/fd_wave_cuda.cu b/kokkos-testing/fd_catalog/fd_wave_cuda.cu index b0edcb4..4ccf0b4 100644 --- a/kokkos-testing/fd_catalog/fd_wave_cuda.cu +++ b/kokkos-testing/fd_catalog/fd_wave_cuda.cu @@ -6,6 +6,7 @@ #include <iostream> #include <cassert> +#include <sstream> #include "fd_grid.hpp" #include "fd_dataio.hpp" @@ -37,6 +38,9 @@ template<typename T> __global__ void wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params) { + /* This kernel saturates the memory bandwidth on GTX650M. Memory bw + * was estimated using Mark Harris' copy/transpose code */ + // HHHH // H****H H = halo, * = data // H****H We load a tile as in figure. This wastes bandwidth @@ -44,6 +48,9 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params) // H****H the easiest way to remain generic w.r.t. grid size // HHHH +#ifdef USE_SHARED_PREV + __shared__ T s_prev[WAVE_8_KER_ROWS][WAVE_8_KER_COLS]; +#endif __shared__ T s_curr[WAVE_8_KER_ROWS+2*WAVE_8_HALO_SIZE][WAVE_8_KER_COLS+2*WAVE_8_HALO_SIZE]; int maxrow = params.maxrow; @@ -55,6 +62,14 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params) assert(maxcol > 1); assert(maxrow > 1); + T c2dt2 = c*c*dt*dt; + T kx2 = c2dt2 * (maxcol-1)*(maxcol-1); + T ky2 = c2dt2 * (maxrow-1)*(maxrow-1); + /* Don't ask me why but if you put the rhs of this in the computation + * of val the kernel runs 2x slower */ + T one_minus_adt = (1.0 - a*dt); + T two_minus_adt = (2.0 - a*dt); + /* Get X/Y coordinates of this grid point */ int ofs_x = blockDim.x * blockIdx.x + threadIdx.x + WAVE_8_HALO_SIZE; int ofs_y = blockDim.y * blockIdx.y + threadIdx.y + WAVE_8_HALO_SIZE; @@ -69,6 +84,9 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params) s_curr[i][j] = u_curr[ofs]; +#ifdef USE_SHARED_PREV + s_prev[threadIdx.y][threadIdx.x] = u_prev[ofs]; +#endif if (threadIdx.x < WAVE_8_HALO_SIZE) { /* X-halo */ @@ -95,25 +113,32 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params) if ( std::is_same<T,double>::value) w = (T*)d_w64; - T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); T deriv_x = 0.0; //deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]); for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i][j+k]; - - T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T deriv_y = 0.0; //deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]); for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+k][j]; - T val = (deriv_x + deriv_y) - (1.0 - a*dt) * u_prev[ofs] + (2.0 - a*dt)*s_curr[i][j]; +#ifdef USE_SHARED_PREV + T val = (deriv_x + deriv_y) + - one_minus_adt * s_prev[threadIdx.y][threadIdx.x] + + two_minus_adt * s_curr[i][j]; +#else + T val = (deriv_x + deriv_y) + - one_minus_adt * u_prev[ofs] + + two_minus_adt * s_curr[i][j]; +#endif + if ( (ofs_x == WAVE_8_HALO_SIZE) or (ofs_y == WAVE_8_HALO_SIZE) or (ofs_x == maxcol+WAVE_8_HALO_SIZE-1) or (ofs_y == maxrow+WAVE_8_HALO_SIZE-1) ) val = 0; - u_next[ofs] = val; + u_next[ofs] = val; } } @@ -175,6 +200,7 @@ double solve_cuda_aux(wave_equation_context<T>& wec) for (size_t i = 0; i < wec.maxiter; i++) { + //sum_kernel_cuda<<<grid_size, threads_per_block>>>(u_prev, u_curr, u_next, params); wave_2D_kernel_cuda<<<grid_size, threads_per_block>>>(u_prev, u_curr, u_next, params); @@ -204,13 +230,21 @@ double solve_cuda_aux(wave_equation_context<T>& wec) std::cout << "[Wave][Cuda] Iteration Time: " << milliseconds/wec.maxiter << "ms" << std::endl; std::cout << "[Wave][Cuda] Wall Time: " << milliseconds << "ms" << std::endl; + + double itertime = milliseconds/wec.maxiter; + double gflops_s = 70*(params.maxrow*params.maxcol)/(1e6*itertime); + std::cout << "[Wave][Cuda] GFlops/s: " << gflops_s << std::endl; + + size_t kernel_bytes = (40*40+2*32*32)*sizeof(T)*grid_size.x*grid_size.y; + double gbytes_s = kernel_bytes/(1e6*itertime); + std::cout << "[Wave][Cuda] Bandwidth: " << gbytes_s << "GB/s" << std::endl; #ifdef HAVE_SILO checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) ); visit_dump(wec.g_curr, "wave_cuda_lastiter.silo"); #endif /* HAVE_SILO */ - return milliseconds/wec.maxiter; + return itertime; } double solve_cuda(wave_equation_context<float>& wec) diff --git a/kokkos-testing/fd_catalog/finite_difference_kokkos.cpp b/kokkos-testing/fd_catalog/finite_difference_kokkos.cpp new file mode 100644 index 0000000..9de8a2b --- /dev/null +++ b/kokkos-testing/fd_catalog/finite_difference_kokkos.cpp @@ -0,0 +1,244 @@ +#include <iostream> +#include <fstream> +#include <sstream> +#include <vector> +#include <cmath> +#include <chrono> +#include <type_traits> + +#include <silo.h> +#include <Kokkos_Core.hpp> + +#define WAVE_8_HALO_SIZE 4 + +using namespace Kokkos; +using namespace std::chrono; + +template<typename T> +int +visit_dump(const Kokkos::View<T**>& kv, const std::string& fn) +{ + static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, + "Only double or float"); + + DBfile *db = nullptr; + db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5); + if (!db) + { + std::cout << "Cannot write simulation output" << std::endl; + return -1; + } + + auto size_x = kv.extent(0); + auto size_y = kv.extent(1); + + std::vector<T> x(size_x); + std::vector<T> y(size_y); + + for (size_t i = 0; i < size_x; i++) + x.at(i) = T(i)/(size_x-1); + + for (size_t i = 0; i < size_y; i++) + y.at(i) = T(i)/(size_y-1); + + int dims[] = { int(size_y), int(size_y) }; + int ndims = 2; + T *coords[] = { x.data(), y.data() }; + + if (std::is_same<T,float>::value) + DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, + DB_FLOAT, DB_COLLINEAR, NULL); + + if (std::is_same<T,double>::value) + DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, + DB_DOUBLE, DB_COLLINEAR, NULL); + + std::vector<T> data(x.size() * y.size()); + + for (size_t i = 0; i < x.size(); i++) + for (size_t j = 0; j < y.size(); j++) + data.at(i*y.size()+j) = kv(i,j); + + if (std::is_same<T,float>::value) + DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims, + NULL, 0, DB_FLOAT, DB_NODECENT, NULL); + + if (std::is_same<T,double>::value) + DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims, + NULL, 0, DB_DOUBLE, DB_NODECENT, NULL); + + DBClose(db); + return 0; +} + +template<typename T> +struct wave_equation_context_kokkos +{ + View<T**> g_prev; + View<T**> g_curr; + View<T**> g_next; + + T velocity; + T damping; + T dt; + int maxiter; + + size_t rows, cols; + size_t grows, gcols; + + + wave_equation_context_kokkos(size_t prows, size_t pcols, + T vel, T damp, T pdt, int pmaxiter) + : rows(prows), cols(pcols), + grows(prows+2*WAVE_8_HALO_SIZE), gcols(pcols+2*WAVE_8_HALO_SIZE) + { + + g_prev = View<T**>("g_prev", grows, gcols); + g_curr = View<T**>("g_curr", grows, gcols); + g_next = View<T**>("g_next", grows, gcols); + velocity = vel; + damping = damp; + dt = pdt; + maxiter = pmaxiter; + + init(); + } + + bool is_halo(size_t i, size_t j) + { + if (i < WAVE_8_HALO_SIZE) + return true; + if (j < WAVE_8_HALO_SIZE) + return true; + if (i >= rows+WAVE_8_HALO_SIZE) + return true; + if (j >= cols+WAVE_8_HALO_SIZE) + return true; + return false; + } + + void init(void) + { + auto dx = 1./(cols-1); + auto dy = 1./(rows-1); + + for (size_t i = 0; i < grows; i++) + { + for (size_t j = 0; j < gcols; j++) + { + if ( is_halo(i,j) ) + { + g_prev(i,j) = 0.0; + g_curr(i,j) = 0.0; + g_next(i,j) = 0.0; + } + else + { + T y = dy*i - 0.3; + T x = dx*j - 0.1; + g_prev(i,j) = -std::exp(-2400*(x*x + y*y)); + g_curr(i,j) = 2*dt*g_prev(i,j); + g_next(i,j) = 0.0; + } + } + } + } +}; + + +template<typename T> +double solve_kokkos(wave_equation_context_kokkos<T>& wec) +{ + int maxrow = wec.rows; + int maxcol = wec.cols; + T dt = wec.dt; + T c = wec.velocity; + T a = wec.damping; + + assert(maxcol > 1); + assert(maxrow > 1); + + // specifying tiling explicitly worsens perf + MDRangePolicy<Rank<2>> range({0,0}, {maxrow-1, maxcol-1}); + + static const T w0 = -205.0/72.0; + static const T w1 = 8.0/5.0; + static const T w2 = -1.0/5.0; + static const T w3 = 8.0/315.0; + static const T w4 = -1.0/560.0; + static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; + + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T one_minus_adt = (1.0 - a*dt); + T two_minus_adt = (2.0 - a*dt); + + double iter_time = 0.0; + for (int ts = 0; ts < wec.maxiter; ts++) + { + auto t_begin = high_resolution_clock::now(); + + parallel_for(range, KOKKOS_LAMBDA(int i, int j) + { + i += WAVE_8_HALO_SIZE; + j += WAVE_8_HALO_SIZE; + + T lapl = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * wec.g_curr(i,j+k); + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * wec.g_curr(i+k,j); + + T val = lapl - one_minus_adt * wec.g_prev(i, j) + two_minus_adt * wec.g_curr(i, j); + + if ( (i == 0) or + (j == 0) or + (i == maxrow-1) or + (j == maxcol-1) + ) + val = 0; + + wec.g_next(i, j) = val; + }); + + auto t_end = high_resolution_clock::now(); + + std::swap(wec.g_prev, wec.g_curr); + std::swap(wec.g_curr, wec.g_next); + + std::chrono::duration<double, std::milli> ms = t_end - t_begin; + iter_time += ms.count(); +#if 0 + if ( (ts % 100) == 0 ) + { + std::stringstream ss; + ss << "wave_kokkos_" << ts << ".silo"; + visit_dump(wec.g_curr, ss.str()); + } +#endif + } + + double avg_iter_time = iter_time/wec.maxiter; + std::cout << "Average iteration time: " << avg_iter_time << std::endl; + return avg_iter_time; +} + +int main(int argc, char *argv[]) +{ + using T = double; + + Kokkos::initialize( argc, argv ); + + for (size_t sz = 128; sz <= 1024; sz *= 2) + { + wave_equation_context_kokkos<T> wec(sz, sz, 1, 0.1, 0.0001, 500); + wec.init(); + solve_kokkos(wec); + } + + Kokkos::finalize(); + + return 0; +} + diff --git a/kokkos-testing/test_gemv.cpp b/kokkos-testing/test_gemv.cpp new file mode 100644 index 0000000..0616c0b --- /dev/null +++ b/kokkos-testing/test_gemv.cpp @@ -0,0 +1,37 @@ +#include <iostream> +#include <vector> +#include <chrono> + +template<typename T> +void gemv(T alpha, const T *A, const T *x, T beta, T *y, size_t M, size_t N) +{ + for (size_t i = 0; i < M; i++) + for (size_t j = 0; j < N; j++) + y[i] = alpha*A[i*N+j]*x[j] + beta*y[i]; +} + +int main(void) +{ + using T = double; + + static const size_t loops = 10000; + static const size_t M = 1000; + static const size_t N = 1000; + + std::vector<T> A(M*N); + std::vector<T> y(M), x(N); + + T alpha = 0.1; + T beta = 0.1; + + auto start = std::chrono::high_resolution_clock::now(); + for (size_t i = 0; i < loops; i++) + gemv(alpha, A.data(), x.data(), beta, y.data(), M, N); + auto stop = std::chrono::high_resolution_clock::now(); + + std::chrono::duration<double, std::milli> ms = stop - start; + + std::cout << "GEMV time: " << ms.count()/loops << " ms" << std::endl; + + return 0; +} diff --git a/kokkos-testing/test_mandel.cpp b/kokkos-testing/test_mandel.cpp new file mode 100644 index 0000000..6365842 --- /dev/null +++ b/kokkos-testing/test_mandel.cpp @@ -0,0 +1,135 @@ +#include <iostream> +#include <fstream> +#include <vector> +#include <chrono> + +#define YMIN (-1.5) +#define YMAX (1.5) +#define XMIN (-2.5) +#define XMAX (1.5) + +#define SIZE_Y 2560 +#define SIZE_X 2048 +#define GPU_BLOCK_X_SZ 32 +#define GPU_BLOCK_Y_SZ 32 + +#define SY_TO_PLANE(y) (YMIN + y*(YMAX-YMIN)/SIZE_Y) +#define SX_TO_PLANE(x) (XMIN + x*(XMAX-XMIN)/SIZE_X) + +#ifdef __NVCC__ + +template<typename T> +__global__ +void gpu_mandel(T *mandel, int maxiter) +{ + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = blockDim.y * blockIdx.y + threadIdx.y; + + if (j >= SIZE_X or i >= SIZE_Y) + return; + + T z_re = 0; + T z_im = 0; + T c_re = SX_TO_PLANE(j); // 4 ops + T c_im = SY_TO_PLANE(i); // 4 ops + + + T zabs2 = (z_re * z_re) + (z_im * z_im); // 3 ops + int iter = 0; + // = 11 ops always + while (zabs2 < 4.0 && iter < maxiter) + { + T z_re2 = z_re*z_re; // 1 op + T z_im2 = z_im*z_im; // 1 op + T z_ri = 2*z_re*z_im; // 2 ops + + z_re = z_re2 - z_im2 + c_re; // 2 ops + z_im = z_ri + c_im; // 1 op + + zabs2 = z_re2 + z_im2; // 1 op + iter++; + } + // = 8 ops * iter + if (iter == maxiter) + mandel[i*SIZE_X + j] = 0.0; + else + mandel[i*SIZE_X + j] = iter; +} + +#endif /* __NVCC__ */ + +template<typename T> +void compute_gpu(std::vector<T>& mandel, int maxiter) +{ + dim3 grid_size(SIZE_X/GPU_BLOCK_SIZE_X + 1, SIZE_Y/GPU_BLOCK_SIZE_Y + 1); + dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS); +} + +int main(void) +{ + size_t maxiter = 1000; + using T = double; + + std::vector<T> mandel(SIZE_X*SIZE_Y); + + auto start = std::chrono::high_resolution_clock::now(); + + size_t flops = 0; + + #pragma omp parallel for + for (size_t i = 0; i < SIZE_Y; i++) + { + for (size_t j = 0; j < SIZE_X; j++) + { + T z_re = 0; + T z_im = 0; + T c_re = SX_TO_PLANE(j); // 4 ops + T c_im = SY_TO_PLANE(i); // 4 ops + + + T zabs2 = (z_re * z_re) + (z_im * z_im); // 3 ops + size_t iter = 0; + // = 11 ops always + while (zabs2 < 4.0 && iter < maxiter) + { + T z_re2 = z_re*z_re; // 1 op + T z_im2 = z_im*z_im; // 1 op + T z_ri = 2*z_re*z_im; // 2 ops + + z_re = z_re2 - z_im2 + c_re; // 2 ops + z_im = z_ri + c_im; // 1 op + + zabs2 = z_re2 + z_im2; // 1 op + iter++; + } + // = 8 ops * iter + if (iter == maxiter) + mandel[i*SIZE_X + j] = 0.0; + else + mandel[i*SIZE_X + j] = iter; + } + } + + for (size_t i = 0; i < SIZE_Y; i++) + for (size_t j = 0; j < SIZE_X; j++) + if (size_t ni = mandel[i*SIZE_X+j]; ni != 0) + flops += 11+8*ni; + else + flops += 11+8*maxiter; + + auto stop = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double, std::milli> s = stop - start; + + std::cout << s.count() << std::endl; + std::cout << flops << std::endl; + + std::cout << "GFlops/s: " << double(flops)/(1e6*s.count()) << std::endl; + + std::ofstream ofs("mandelbrot.txt"); + for (size_t i = 0; i < SIZE_Y; i++) + for (size_t j = 0; j < SIZE_X; j++) + ofs << j << " " << i << " " << mandel[i*SIZE_X + j] << std::endl; + + + return 0; +} diff --git a/kokkos-testing/test_sumbw.cpp b/kokkos-testing/test_sumbw.cpp new file mode 100644 index 0000000..4ee808e --- /dev/null +++ b/kokkos-testing/test_sumbw.cpp @@ -0,0 +1,87 @@ +#include <iostream> +#include <fstream> +#include <vector> +#include <chrono> +#include <numeric> + +extern "C" { + +void sum_restrict(const double * __restrict__ prev, + const double * __restrict__ curr, + double * __restrict__ next, + size_t sz) +{ + for (size_t i = 0; i < sz; i++) + next[i] = curr[i] + prev[i]; +} +} + +template<typename T> +void +sum(const std::vector<T>& prev, const std::vector<T>& curr, + std::vector<T>& next) +{ + if ( not (prev.size() == curr.size() and curr.size() == next.size()) ) + return; + + for (size_t i = 0; i < curr.size(); i++) + next[i] = curr[i] + prev[i]; +} + +int main(void) +{ + using T = double; + + std::vector<std::vector<double>> itertimes; + + size_t maxiter = 500; + + for (size_t sz = 128; sz <= 8192; sz *= 2) + { + std::vector<T> prev(sz*sz); + std::vector<T> curr(sz*sz); + std::vector<T> next(sz*sz); + + for(size_t i = 0; i < sz*sz; i++) + { + prev[i] = i + 1; + curr[i] = i % 18; + next[i] = 37*i; + } + + std::vector<double> itertime(maxiter); + + for (size_t iter = 0; iter < maxiter; iter++) + { + auto start = std::chrono::high_resolution_clock::now(); + sum_restrict(prev.data(), curr.data(), next.data(), sz*sz); + //memcpy(next.data(), curr.data(), sz*sz*sizeof(T)); + std::swap(prev, curr); + std::swap(curr, next); + auto stop = std::chrono::high_resolution_clock::now(); + + std::chrono::duration<double, std::milli> ms = stop - start; + itertime[iter] = ms.count(); + } + + auto time = std::accumulate(itertime.begin(), itertime.end(), 0.0) / maxiter; + std::cout << "Sum bandwidth: " << 3*sizeof(T)*sz*sz/(1e6*time); + std::cout << " GB/s" << std::endl; + + itertimes.push_back( std::move(itertime) ); + } + + std::ofstream ofs("itertimes.txt"); + + for (size_t i = 0; i < maxiter; i++) + { + for (size_t j = 0; j < itertimes.size(); j++) + { + ofs << itertimes[j][i] << " "; + } + ofs << std::endl; + } +} + + + -- GitLab