Commit f0386bda authored by Matteo Cicuttin

Added modular finite difference test code 'fd_catalog'.

parent cd0cbd2e
project(kokkos-testing)
cmake_minimum_required(VERSION 3.10)
find_package(CUDA)
add_subdirectory(kokkos)
add_executable(finite_difference_kokkos finite_difference_kokkos.cpp)
......
set term postscript enhanced color
set output 'timing.eps'
-set title '2D damped wave equation, 1024x1024 grid'
+set title '2D damped wave equation'
set style data histogram
set style fill solid border -1
set logscale y
set grid ytics mytics
-plot 'timings2.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col
+plot 'timings-float.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col
project(fd_catalog)
cmake_minimum_required(VERSION 3.10)
find_package(CUDA)
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <iostream>
#include <string>
#include <vector>
#include <type_traits>

#include "fd_grid.hpp"

#ifdef HAVE_SILO
#include <silo.h>
#endif /* HAVE_SILO */
#ifdef HAVE_SILO
template<typename T>
int
visit_dump(const fd_grid<T>& g, const std::string& fn)
{
static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = g.grid_cols();
auto size_y = g.grid_rows();
std::vector<T> x(size_x);
std::vector<T> y(size_y);
auto dx = g.dx();
auto px = -dx * g.halo();
for (size_t i = 0; i < size_x; i++)
x.at(i) = px + i*dx;
auto dy = g.dy();
auto py = -dy * g.halo();
for (size_t i = 0; i < size_y; i++)
y.at(i) = py + i*dy;
int dims[] = { int(size_x), int(size_y) };
int ndims = 2;
T *coords[] = {x.data(), y.data()};
if (std::is_same<T,float>::value)
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
}
else
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
}
DBClose(db);
return 0;
}
#endif /* HAVE_SILO */
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <vector>
#include <cstdint>
#include <cstddef>
#include <cassert>
#include <algorithm>
template<typename T, typename UInt = uint64_t, typename Int = int64_t>
class fd_grid
{
std::vector<T> m_data;
UInt m_grid_rows, m_grid_cols;
UInt m_domain_rows, m_domain_cols, m_halo;
public:
fd_grid() :
m_grid_rows(0), m_grid_cols(0),
m_domain_rows(0), m_domain_cols(0), m_halo(0)
{}
fd_grid(UInt rows, UInt cols, UInt halo) :
m_grid_rows(rows+2*halo),
m_grid_cols(cols+2*halo),
m_domain_rows(rows),
m_domain_cols(cols),
m_halo(halo),
m_data( (rows+2*halo) * (cols+2*halo) )
{}
fd_grid(fd_grid&& other)
{
m_grid_rows = other.m_grid_rows;
m_grid_cols = other.m_grid_cols;
m_domain_rows = other.m_domain_rows;
m_domain_cols = other.m_domain_cols;
m_halo = other.m_halo;
m_data = std::move(other.m_data);
}
fd_grid& operator=(const fd_grid& other)
{
m_grid_rows = other.m_grid_rows;
m_grid_cols = other.m_grid_cols;
m_domain_rows = other.m_domain_rows;
m_domain_cols = other.m_domain_cols;
m_halo = other.m_halo;
m_data.resize( other.m_data.size() );
std::copy(other.m_data.begin(), other.m_data.end(), m_data.begin());
return *this;
}
fd_grid& operator=(fd_grid&& other)
{
m_grid_rows = other.m_grid_rows;
m_grid_cols = other.m_grid_cols;
m_domain_rows = other.m_domain_rows;
m_domain_cols = other.m_domain_cols;
m_halo = other.m_halo;
m_data = std::move(other.m_data);
return *this;
}
fd_grid& operator=(const T& val)
{
for (auto& d : m_data)
d = val;
return *this;
}
/* Negative indices are allowed to access halo */
T& operator()(Int i, Int j)
{
auto ofs_row = i + m_halo;
assert(ofs_row < m_domain_rows + 2*m_halo);
auto ofs_col = j + m_halo;
assert(ofs_col < m_domain_cols + 2*m_halo);
auto grid_cols = m_domain_cols+2*m_halo;
auto ofs = ofs_row * grid_cols + ofs_col;
assert( (ofs >= 0) and (ofs < m_data.size()) );
return m_data[ofs];
}
/* Negative indices are allowed to access halo */
T operator()(Int i, Int j) const
{
auto ofs_row = i + m_halo;
auto ofs_col = j + m_halo;
auto grid_cols = m_domain_cols+2*m_halo;
auto ofs = ofs_row * grid_cols + ofs_col;
assert( (ofs >= 0) and (ofs < m_data.size()) );
return m_data[ofs];
}
size_t domain_rows() const { return m_domain_rows; }
size_t domain_cols() const { return m_domain_cols; }
size_t halo() const { return m_halo; }
size_t grid_rows() const { return m_grid_rows; }
size_t grid_cols() const { return m_grid_cols; }
T* data() { return m_data.data(); }
const T* data() const { return m_data.data(); }
size_t size() const { return m_data.size(); }
T dx() const { return 1./(m_domain_cols-1); }
T dy() const { return 1./(m_domain_rows-1); }
};
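A minimal usage sketch of fd_grid (not part of the commit; the 4x4 interior, 2-cell halo and the printed values are only illustrative): negative indices reach into the halo, while dx()/dy() are derived from the interior size only.

#include <iostream>
#include "fd_grid.hpp"

int main()
{
    fd_grid<double> g(4, 4, 2);   // interior rows, interior cols, halo width
    g = 0.0;                      // fill the whole grid, halo included
    g(0, 0)  = 1.0;               // first interior point
    g(-1, 0) = 0.5;               // halo row just above the interior
    std::cout << g.grid_rows() << "x" << g.grid_cols()   // 8x8 with the halo
              << ", dx = " << g.dx() << std::endl;       // 1/(4-1) = 1/3
    return 0;
}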
#include <iostream>
#include <fstream>
#include <cstdio>
#include <thread>

#include "fd_wave_cpu.hpp"

#ifdef HAVE_CUDA
#include "fd_wave_cuda.hpp"
#endif
int main(void)
{
#ifdef SINGLE_PRECISION
using T = float;
#else
using T = double;
#endif
std::ofstream ofs("timings.txt");
/* Make header */
ofs << "\"SIZE\" \"Seq\" \"SeqBlk\" ";
#ifdef HAVE_CUDA
ofs << "Cuda ";
#endif
auto maxthreads = std::thread::hardware_concurrency();
for (size_t threads = 1; threads <= maxthreads; threads *= 2)
ofs << "\"" << threads << " threads\" ";
ofs << std::endl;
double time;
for (size_t sz = 128; sz <= 1024; sz *= 2)
{
wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 500);
ofs << sz << " ";
wec.init();
time = solve_sequential(wec);
ofs << time << " ";
wec.init();
time = solve_sequential_blocked(wec);
ofs << time << " ";
#ifdef HAVE_CUDA
wec.init();
time = solve_cuda(wec);
ofs << time << " ";
#endif
for (size_t threads = 1; threads <= maxthreads; threads *= 2)
{
wec.init();
time = solve_multithread(wec, threads);
ofs << time << " ";
}
ofs << std::endl;
}
}
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <cmath>
#include <cstddef>

#include "fd_grid.hpp"

#define WAVE_8_HALO_SIZE 4
template<typename T>
struct wave_2D_params
{
int maxrow;
int maxcol;
T dt;
T velocity;
T damping;
};
template<typename T>
struct wave_equation_context
{
fd_grid<T> g_prev;
fd_grid<T> g_curr;
fd_grid<T> g_next;
T velocity;
T damping;
T dt;
int maxiter;
wave_equation_context(size_t rows, size_t cols, T vel, T damp, T pdt, int pmaxiter)
{
g_prev = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
g_curr = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
g_next = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
velocity = vel;
damping = damp;
dt = pdt;
maxiter = pmaxiter;
init();
}
void init(void)
{
for (size_t i = 0; i < g_curr.domain_rows(); i++)
{
for (size_t j = 0; j < g_curr.domain_cols(); j++)
{
T y = g_curr.dy()*i - 0.3;
T x = g_curr.dx()*j - 0.1;
g_prev(i,j) = -std::exp(-2400*(x*x + y*y));
g_curr(i,j) = 2*dt*g_prev(i,j);
g_next(i,j) = 0.0;
}
}
}
};
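For reference, the update rule that every kernel below implements, read off from the code (a is the damping, c the velocity, and D_x^(8), D_y^(8) the 8th-order second-difference stencils scaled by 1/Δx² and 1/Δy²; the value is forced to zero on the domain boundary):

$$ u^{n+1}_{i,j} = (2 - a\,\Delta t)\,u^{n}_{i,j} - (1 - a\,\Delta t)\,u^{n-1}_{i,j} + c^2 \Delta t^2 \left(D_x^{(8)} + D_y^{(8)}\right) u^{n}_{i,j} $$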
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include <iostream>
#include <fstream>
#include <sstream>
#include <chrono>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <algorithm>
#include <utility>
#include <cassert>
#include <list>
#include <vector>
#include <cmath>

#include "fd_grid.hpp"
#include "fd_dataio.hpp"
#include "fd_wave.hpp"
struct tile
{
int dim_x;
int dim_y;
int idx_x;
int idx_y;
tile() {}
tile(int dx, int dy, int ix, int iy)
: dim_x(dx), dim_y(dy), idx_x(ix), idx_y(iy)
{}
};
inline std::ostream&
operator<<(std::ostream& os, const tile& tl)
{
os << "tile (" << tl.idx_x << ", " << tl.idx_y << ") " << tl.dim_x << ", " << tl.dim_y;
return os;
}
/* This is the naive implementation of a wave equation kernel. You can specify
* a tile to compute only on a subdomain. */
template<typename T>
void
wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params, tile tl)
{
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
T c = params.velocity;
T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
for (size_t i = 0; i < tl.dim_y; i++)
{
for (size_t j = 0; j < tl.dim_x; j++)
{
/* Get X/Y coordinates of this grid point */
int ofs_x = tl.dim_x * tl.idx_x + j;
int ofs_y = tl.dim_y * tl.idx_y + i;
if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
{
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
//deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y,ofs_x+k);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
//deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y+k,ofs_x);
T val = (deriv_x + deriv_y) - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) + (2.0 - a*dt)*g_curr(ofs_y, ofs_x);
if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
val = 0;
g_next(ofs_y, ofs_x) = val;
}
}
}
}
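The w0..w4 values above are the standard 8th-order central-difference weights for the second derivative. A small standalone check (assumed test function and spacing, not part of the commit) applies the same stencil to sin(x) and compares against the exact -sin(x):

#include <cmath>
#include <cstdio>

int main()
{
    const double w[9] = { -1.0/560.0, 8.0/315.0, -1.0/5.0, 8.0/5.0, -205.0/72.0,
                          8.0/5.0, -1.0/5.0, 8.0/315.0, -1.0/560.0 };
    const double h = 0.01;        // grid spacing (arbitrary for the test)
    const double x = 0.7;         // evaluation point (arbitrary)
    double d2 = 0.0;
    for (int k = -4; k <= 4; k++)
        d2 += w[k + 4] * std::sin(x + k*h);
    d2 /= h*h;                    // the weights give h^2 * u''
    // Exact second derivative of sin is -sin; the two values should agree
    // to roughly ten significant digits at this spacing.
    std::printf("stencil: %.12f  exact: %.12f\n", d2, -std::sin(x));
    return 0;
}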
/* This is a "blocked" kernel. It copies a block on a "private" chunk of
* memory, computes there and stores the result back. This approach looks
* slightly faster than the "naive" kernel. */
#define WAVE_8_CPUKER_COLS 56
#define WAVE_8_CPUKER_ROWS 56
template<typename T>
void
wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params, tile tl)
{
const int maxrow = params.maxrow;
const int maxcol = params.maxcol;
const T dt = params.dt;
const T c = params.velocity;
const T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/**** Initialize constants ****/
static const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
static const size_t MX_SZ = WAVE_8_CPUKER_COLS+2*WAVE_8_HALO_SIZE;
static const size_t MY_SZ = WAVE_8_CPUKER_ROWS+2*WAVE_8_HALO_SIZE;
T s_curr[MX_SZ][MY_SZ];
for (size_t bi = 0; bi < tl.dim_y; bi += WAVE_8_CPUKER_ROWS)
{
for (size_t bj = 0; bj < tl.dim_x; bj += WAVE_8_CPUKER_COLS)
{
/* Load working tile */
for (size_t i = 0; i < MY_SZ; i++)
{
for (size_t j = 0; j < MX_SZ; j++)
{
int ofs_x = tl.idx_x * tl.dim_x + bj + j;
int ofs_y = tl.idx_y * tl.dim_y + bi + i;
if ( (ofs_x < (maxcol+WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow+WAVE_8_HALO_SIZE)) )
s_curr[i][j] = g_curr(ofs_y-WAVE_8_HALO_SIZE, ofs_x-WAVE_8_HALO_SIZE);
}
}
/* Do finite differences */
for (size_t i = 0; i < WAVE_8_CPUKER_ROWS; i++)
{
for (size_t j = 0; j < WAVE_8_CPUKER_COLS; j++)
{
/* Get X/Y coordinates of this grid point */
int ofs_x = tl.idx_x * tl.dim_x + bj + j;
int ofs_y = tl.idx_y * tl.dim_y + bi + i;
if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
{
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = (deriv_x + deriv_y)
- (1.0 - a*dt) * g_prev(ofs_y, ofs_x)
+ (2.0 - a*dt) * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
val = 0;
g_next(ofs_y, ofs_x) = val;
}
}
}
}
}
}
template<typename T, bool blocked>
double
solve_sequential_aux(wave_equation_context<T>& wec)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
tile tl;
tl.dim_x = params.maxcol;
tl.dim_y = params.maxrow;
tl.idx_x = 0;
tl.idx_y = 0;
double time = 0.0;
std::ofstream ofs("itertime.txt");
for (size_t i = 0; i < wec.maxiter; i++)
{
auto start = std::chrono::high_resolution_clock::now();
if (blocked)
wave_2D_kernel_tiled_blocked(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
else
wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (i%100) == 0 )
{
std::stringstream ss;
ss << "wave_tiled_" << i << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
ofs << i << " " << ms.count() << std::endl;
}
if (blocked)
{
std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl;
}
else
{
std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl;
}
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_sequential_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
template<typename T>
double solve_sequential(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,false>(wec);
}
template<typename T>
double solve_sequential_blocked(wave_equation_context<T>& wec)
{
return solve_sequential_aux<T,true>(wec);
}
template<typename T>
double solve_multithread(wave_equation_context<T>& wec, size_t nths)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
/* Tile sizes */
size_t tile_size_x = 32;
size_t tile_size_y = 32;
size_t num_tiles_x = params.maxcol/tile_size_x + 1;
size_t num_tiles_y = params.maxrow/tile_size_y + 1;
/* Multithreading stuff */
std::mutex cout_mtx;
std::mutex cv_mtx;
std::condition_variable prod_cv;
std::condition_variable cons_cv;
std::list<tile> tiles;
bool iteration_finished = false;
std::vector<bool> thread_done(nths);
for (size_t i = 0; i < nths; i++)
thread_done[i] = true;
auto thread_lambda = [&](int tid) {
while (1)
{
/* Wait for the producer to notify that there's something to do */
{
std::unique_lock<std::mutex> lck(cv_mtx);
while ( thread_done[tid] )
cons_cv.wait(lck);
if (iteration_finished)
{
assert(tiles.size() == 0);
return;
}
}
/* Fetch work and process data */
while (1)
{
tile tl;
{
std::unique_lock<std::mutex> lck(cv_mtx);
if (tiles.size() == 0)
break;
assert(tiles.size() != 0);
tl = tiles.front();
tiles.pop_front();
}
wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
}
/* Work for this thread finished, notify producer */
std::unique_lock<std::mutex> lck(cv_mtx);
thread_done[tid] = true;
prod_cv.notify_one();
}
};
std::vector<std::thread> threads(nths);
for (size_t i = 0; i < nths; i++)
threads[i] = std::thread(thread_lambda, i);
double time = 0.0;
for (size_t iter = 0; iter < wec.maxiter; iter++)
{
auto start = std::chrono::high_resolution_clock::now();
std::unique_lock<std::mutex> lck(cv_mtx);
for (size_t i = 0; i < num_tiles_y; i++)
for (size_t j = 0; j < num_tiles_x; j++)
tiles.push_back( tile(tile_size_x, tile_size_y, j, i) );
/* Mark data ready and start the threads */
for (size_t i = 0; i < nths; i++)
thread_done[i] = false;
cons_cv.notify_all();
while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) )
prod_cv.wait(lck);
assert(tiles.size() == 0);
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (iter%100) == 0 )
{
std::stringstream ss;
ss << "wave_mt_" << iter << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
}
/* Tell all the threads to stop */
{
std::unique_lock<std::mutex> lck(cv_mtx);
for (size_t i = 0; i < nths; i++)
thread_done[i] = false;
iteration_finished = true;
cons_cv.notify_all();
}
/* Wait for all the threads to finish */
for (auto& th : threads)
th.join();
std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
std::cout << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
std::cout << time << "ms" << std::endl;
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_tiled_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
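solve_multithread above hands out tiles through a shared list guarded by one mutex and two condition variables: the producer fills the list, wakes the consumers and waits until every worker reports back. A stripped-down sketch of the same hand-off idea (a busy counter instead of the per-thread flags, hypothetical work items, no FD kernel):

#include <condition_variable>
#include <cstdio>
#include <list>
#include <mutex>
#include <thread>
#include <vector>

int main()
{
    std::mutex mtx;
    std::condition_variable cons_cv, prod_cv;
    std::list<int> work;               // stands in for the tile list
    bool done = false;
    int busy = 0;

    auto worker = [&] {
        while (true)
        {
            std::unique_lock<std::mutex> lck(mtx);
            cons_cv.wait(lck, [&]{ return !work.empty() || done; });
            if (done)
                return;
            int item = work.front();
            work.pop_front();
            busy++;
            lck.unlock();
            std::printf("processing item %d\n", item);   // kernel call would go here
            lck.lock();
            busy--;
            prod_cv.notify_one();
        }
    };

    std::vector<std::thread> pool;
    for (int i = 0; i < 4; i++)
        pool.emplace_back(worker);

    {   /* one "iteration": enqueue work and wait for the list to drain */
        std::unique_lock<std::mutex> lck(mtx);
        for (int i = 0; i < 16; i++)
            work.push_back(i);
        cons_cv.notify_all();
        prod_cv.wait(lck, [&]{ return work.empty() && busy == 0; });
    }

    {   /* shut the pool down */
        std::unique_lock<std::mutex> lck(mtx);
        done = true;
        cons_cv.notify_all();
    }

    for (auto& th : pool)
        th.join();
    return 0;
}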
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#include <iostream>
#include <sstream>
#include <cstdio>
#include <cassert>
#include <type_traits>

#include "fd_grid.hpp"
#include "fd_dataio.hpp"
#include "fd_wave.hpp"

#define DEBUG
#include "cuda_runtime.h"
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
/* 8th order 2nd derivative weights */
__constant__ float d_w32[9];
__constant__ double d_w64[9];
#define WAVE_8_KER_ROWS 32
#define WAVE_8_KER_COLS 32
template<typename T>
__global__ void
wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params)
{
// HHHH
// H****H H = halo, * = data
// H****H We load a tile as in figure. This wastes bandwidth
// H****H because halos are loaded two times each, but it is
// H****H the easiest way to remain generic w.r.t. grid size
// HHHH
__shared__ T s_curr[WAVE_8_KER_ROWS+2*WAVE_8_HALO_SIZE][WAVE_8_KER_COLS+2*WAVE_8_HALO_SIZE];
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
T c = params.velocity;
T a = params.damping;
assert(maxcol > 1);
assert(maxrow > 1);
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x + WAVE_8_HALO_SIZE;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y + WAVE_8_HALO_SIZE;
/* Get offset of this grid point in the fd grid array */
int ofs = (maxcol + 2*WAVE_8_HALO_SIZE) * ofs_y + ofs_x;
/* Local coordinates inside the shared-memory tile */
int i = threadIdx.y + WAVE_8_HALO_SIZE;
int j = threadIdx.x + WAVE_8_HALO_SIZE;
bool in_grid = (ofs_x < (maxcol + WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow + WAVE_8_HALO_SIZE));
if (in_grid)
{
s_curr[i][j] = u_curr[ofs];
if (threadIdx.x < WAVE_8_HALO_SIZE)
{
/* X-halo */
int ox = min(WAVE_8_KER_COLS, maxcol-blockDim.x * blockIdx.x);
s_curr[i][j-WAVE_8_HALO_SIZE] = u_curr[ofs-WAVE_8_HALO_SIZE];
s_curr[i][j+ox] = u_curr[ofs+ox];
}
if (threadIdx.y < WAVE_8_HALO_SIZE)
{
/* Y-halo */
int oy = min(WAVE_8_KER_ROWS, maxrow-blockDim.y * blockIdx.y);
int b_ofs = ofs - WAVE_8_HALO_SIZE * (2*WAVE_8_HALO_SIZE+maxcol);
int t_ofs = ofs + oy * (2*WAVE_8_HALO_SIZE+maxcol);
s_curr[i-WAVE_8_HALO_SIZE][j] = u_curr[b_ofs];
s_curr[i+oy][j] = u_curr[t_ofs];
}
}
/* Every thread of the block must reach the barrier, so it stays outside
 * the bounds check. */
__syncthreads();
if (in_grid)
{
T* w;
if ( std::is_same<T,float>::value)
w = (T*)d_w32;
else
w = (T*)d_w64;
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T deriv_x = 0.0;
//deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i][j+k];
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T deriv_y = 0.0;
//deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+k][j];
T val = (deriv_x + deriv_y) - (1.0 - a*dt) * u_prev[ofs] + (2.0 - a*dt)*s_curr[i][j];
/* Homogeneous Dirichlet boundary conditions */
if ( (ofs_x == WAVE_8_HALO_SIZE) or (ofs_y == WAVE_8_HALO_SIZE) or
(ofs_x == maxcol+WAVE_8_HALO_SIZE-1) or (ofs_y == maxrow+WAVE_8_HALO_SIZE-1)
)
val = 0;
u_next[ofs] = val;
}
}
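The halo-loading scheme in the comment above is easier to see in one dimension. A hypothetical 1D kernel (not part of fd_catalog) that stages a tile plus HALO cells on each side into shared memory, assuming blockDim.x == TILE and arrays of n + 2*HALO entries:

#include <cuda_runtime.h>

#define TILE 128
#define HALO 4

/* Launch e.g. as stage_with_halo<<<(n + TILE - 1)/TILE, TILE>>>(u, out, n). */
template<typename T>
__global__ void stage_with_halo(const T* u, T* out, int n)
{
    __shared__ T s[TILE + 2*HALO];

    int gx = blockIdx.x * blockDim.x + threadIdx.x + HALO; /* global index, past the left halo */
    int lx = threadIdx.x + HALO;                           /* local index inside the shared tile */
    bool in_grid = gx < n + HALO;

    if (in_grid)
    {
        s[lx] = u[gx];
        if (threadIdx.x < HALO)   /* the first HALO threads also load both halos */
        {
            int ox = min(TILE, n - (int)(blockIdx.x * blockDim.x));
            s[lx - HALO] = u[gx - HALO];
            s[lx + ox]   = u[gx + ox];
        }
    }
    __syncthreads();              /* barrier outside the bounds check */

    if (in_grid)
        out[gx] = s[lx];          /* a real kernel would apply the stencil here */
}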
template<typename T>
double solve_cuda_aux(wave_equation_context<T>& wec)
{
/* Get device properties */
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, 0) );
printf("Device: %s (Compute Capability: %d.%d)\n",
prop.name, prop.major, prop.minor);
/**** Initialize constants ****/
T w0 = -205.0/72.0;
T w1 = 8.0/5.0;
T w2 = -1.0/5.0;
T w3 = 8.0/315.0;
T w4 = -1.0/560.0;
T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
if (std::is_same<T,float>::value)
checkCuda( cudaMemcpyToSymbol(d_w32, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
if (std::is_same<T,double>::value)
checkCuda( cudaMemcpyToSymbol(d_w64, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
/* Allocate memory on device */
T* u_prev;
T* u_curr;
T* u_next;
T* u_temp;
checkCuda( cudaMalloc((void**)&u_prev, wec.g_prev.size()*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_curr, wec.g_curr.size()*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_next, wec.g_next.size()*sizeof(T)) );
/* Copy grids on device */
checkCuda( cudaMemcpy(u_prev, wec.g_prev.data(), wec.g_prev.size()*sizeof(T), cudaMemcpyHostToDevice) );
checkCuda( cudaMemcpy(u_curr, wec.g_curr.data(), wec.g_curr.size()*sizeof(T), cudaMemcpyHostToDevice) );
checkCuda( cudaMemcpy(u_next, wec.g_next.data(), wec.g_next.size()*sizeof(T), cudaMemcpyHostToDevice) );
/* Launch parameters */
dim3 grid_size(wec.g_curr.domain_cols()/WAVE_8_KER_COLS + 1, wec.g_curr.domain_rows()/WAVE_8_KER_ROWS + 1);
dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS);
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
/* Prepare for timing */
cudaEvent_t startEvent, stopEvent;
checkCuda( cudaEventCreate(&startEvent) );
checkCuda( cudaEventCreate(&stopEvent) );
checkCuda( cudaEventRecord(startEvent, 0) );
for (size_t i = 0; i < wec.maxiter; i++)
{
wave_2D_kernel_cuda<<<grid_size, threads_per_block>>>(u_prev, u_curr, u_next, params);
u_temp = u_prev;
u_prev = u_curr;
u_curr = u_next;
u_next = u_temp;
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (i%100) == 0 )
{
checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) );
std::stringstream ss;
ss << "wave_cuda_" << i << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
}
float milliseconds;
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
std::cout << "[Wave][Cuda] Iteration Time: " << milliseconds/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][Cuda] Wall Time: " << milliseconds << "ms" << std::endl;
#ifdef HAVE_SILO
checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) );
visit_dump(wec.g_curr, "wave_cuda_lastiter.silo");
#endif /* HAVE_SILO */
checkCuda( cudaEventDestroy(startEvent) );
checkCuda( cudaEventDestroy(stopEvent) );
checkCuda( cudaFree(u_prev) );
checkCuda( cudaFree(u_curr) );
checkCuda( cudaFree(u_next) );
return milliseconds/wec.maxiter;
}
double solve_cuda(wave_equation_context<float>& wec)
{
return solve_cuda_aux(wec);
}
double solve_cuda(wave_equation_context<double>& wec)
{
return solve_cuda_aux(wec);
}
/* Performance evaluation of different implementations of finite difference
* schemes.
*
* Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
*/
#pragma once

#include "fd_wave.hpp"

double solve_cuda(wave_equation_context<float>&);
double solve_cuda(wave_equation_context<double>&);
\ No newline at end of file
...
@@ -85,7 +85,7 @@ visit_dump(const fd_grid<double>& g, const std::string& fn)
return 0;
}
-//#define HIGH_ORDER
+#define HIGH_ORDER
#ifdef HIGH_ORDER
#define HALO_SIZE 4
...
@@ -97,16 +97,16 @@ int main( int argc, char* argv[] )
{
using namespace std::chrono;
-int Nx = 1024;
-int Ny = 1024;
+int Nx = 500;
+int Ny = 500;
int halo = HALO_SIZE;
int rows = Ny + 2*halo;
int cols = Nx + 2*halo;
-int timesteps = 500;
+int timesteps = 5000;
double dx = 1./(Nx-1);
double dy = 1./(Ny-1);
-double dt = 0.0001;
+double dt = 0.001;
double velocity = 1;
double damping = 0.1;
......
#include <iostream>
#include <fstream>
#include <vector>
#include <cstdio>
#include <cassert>
#include <silo.h>
#define DEBUG
#define SINGLE_PRECISION
template<typename T>
class fd_grid
{
std::vector<T> m_data;
size_t m_rows, m_cols;
public:
fd_grid()
{}
fd_grid(size_t rows, size_t cols) :
m_data(rows*cols), m_rows(rows), m_cols(cols)
{}
fd_grid(fd_grid&& other)
{
std::swap(m_data, other.m_data);
}
fd_grid& operator=(fd_grid&& other)
{
std::swap(m_data, other.m_data);
return *this;
}
T operator()(size_t i, size_t j) const
{
return m_data[i*m_cols + j];
}
T& operator()(size_t i, size_t j)
{
return m_data[i*m_cols + j];
}
size_t rows() const { return m_rows; }
size_t cols() const { return m_cols; }
T* data() { return m_data.data(); }
const T* data() const { return m_data.data(); }
};
template<typename T>
int
visit_dump(const fd_grid<T>& g, const std::string& fn)
{
static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
DBfile *db = nullptr;
db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
if (!db)
{
std::cout << "Cannot write simulation output" << std::endl;
return -1;
}
auto size_x = g.cols();
auto size_y = g.rows();
std::vector<T> x(size_x);
std::vector<T> y(size_y);
for (size_t i = 0; i < size_x; i++)
x.at(i) = T(i)/(size_x-1);
for (size_t i = 0; i < size_y; i++)
y.at(i) = T(i)/(size_y-1);
int dims[] = { int(size_y), int(size_y) };
int ndims = 2;
T *coords[] = {x.data(), y.data()};
if (std::is_same<T,float>::value)
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_FLOAT, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
}
else
{
DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims, DB_DOUBLE, DB_COLLINEAR, NULL);
DBPutQuadvar1(db, "solution", "mesh", g.data(), dims, ndims, NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
}
DBClose(db);
return 0;
}
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
#ifdef SINGLE_PRECISION
__constant__ float d_w[9];
#else
__constant__ double d_w[9];
#endif
-template<typename T>
-__global__ void test_kernel(T* data)
+#include <sstream>
+#include <cstdio>
{
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
data[ofs] = gridDim.x * blockIdx.y + blockIdx.x;
}
#define GRID_ROWS 32
#define GRID_COLS 32
#define HALO_SIZE 4
int grid_rows;
int grid_cols;
template<typename T>
__global__ void x_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[GRID_COLS+2*HALO_SIZE];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs = threadIdx.x + HALO_SIZE;
s_curr[shm_ofs] = u_curr[ofs];
__syncthreads();
if (threadIdx.x < 4)
{
s_curr[threadIdx.x] = 0.0;
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#if 0
template<typename T>
__global__ void y_kernel(/*T* u_prev, */T* u_curr, T* u_next)
{
__shared__ T s_curr[2*HALO_SIZE][32];
/* Get X/Y coordinates of this grid point */
int ofs_x = blockDim.x * blockIdx.x + threadIdx.x;
int ofs_y = blockDim.y * blockIdx.y + threadIdx.y;
/* Get offset of this grid point in the fd grid array */
int ofs = gridDim.x * ofs_y * blockDim.x + ofs_x;
int shm_ofs_x = threadIdx.x;
int shm_ofs_y = threadIdx.y;
s_curr[shm_ofs_x][shm_ofs_y] = u_curr[ofs];
__syncthreads();
-if (threadIdx.y < 4)
-{
-s_curr[threadIdx.x] = 0.0;
+#include "fd_grid.hpp"
+#include "fd_dataio.hpp"
+#include "fd_wave_cpu.hpp"
s_curr[HALO_SIZE+GRID_COLS + threadIdx.x] = 0.0;
}
__syncthreads();
T dx = 0.0;
for (int k = -HALO_SIZE; k <= HALO_SIZE; k++)
dx += d_w[k+HALO_SIZE]*s_curr[shm_ofs+k];
u_next[ofs] = dx;
}
#endif
int main(void)
{
#ifdef SINGLE_PRECISION
using T = float;
#else
using T = double;
#endif
-/**** Get device properties ****/
+wave_equation_context<T> wec(512, 512, 1, 0.1, 0.0001, 5000);
cudaDeviceProp prop;
checkCuda( cudaGetDeviceProperties(&prop, 0) );
printf("Device Name: %s\n", prop.name);
printf("Compute Capability: %d.%d\n\n", prop.major, prop.minor);
/**** Initialize constants ****/
T w0 = -205.0/72.0;
T w1 = 8.0/5.0;
T w2 = -1.0/5.0;
T w3 = 8.0/315.0;
T w4 = -1.0/560.0;
T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
checkCuda( cudaMemcpyToSymbol(d_w, w, 9*sizeof(T), 0, cudaMemcpyHostToDevice) );
-/**** Prepare for timing ****/
-float milliseconds;
-cudaEvent_t startEvent, stopEvent;
-checkCuda( cudaEventCreate(&startEvent) );
-checkCuda( cudaEventCreate(&stopEvent) );
+#if 0
+#ifdef __NVCC__
+wec.init();
+solve_cuda(wec);
+#endif /* __NVCC__ */
#endif
/**** Prepare grid and copy to device ****/
fd_grid<T> g(GRID_ROWS, GRID_COLS);
for (size_t i = 0; i < GRID_ROWS; i++)
for (size_t j = 0; j < GRID_COLS; j++)
g(i,j) = std::sin(M_PI*T(i)/GRID_ROWS)*std::sin(M_PI*T(j)/GRID_COLS);
//g(i,j) = (T(j)/GRID_COLS)*(T(j)/GRID_COLS);
T* u_curr;
T* u_next;
checkCuda( cudaMalloc((void**)&u_curr, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMalloc((void**)&u_next, GRID_ROWS*GRID_COLS*sizeof(T)) );
checkCuda( cudaMemcpy(u_curr, g.data(), GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyHostToDevice) );
dim3 gs(1,GRID_ROWS);
dim3 tpb(GRID_COLS,1);
checkCuda( cudaEventRecord(startEvent, 0) );
//for (size_t i = 0; i < 1; i++)
x_kernel<<<gs, tpb>>>(u_curr, u_next);
checkCuda( cudaEventRecord(stopEvent, 0) );
checkCuda( cudaEventSynchronize(stopEvent) );
checkCuda( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent) );
std::cout << "Kernel execution time: " << milliseconds << " ms." << std::endl;
checkCuda( cudaDeviceSynchronize() );
-checkCuda( cudaMemcpy(g.data(), u_next, GRID_ROWS*GRID_COLS*sizeof(T), cudaMemcpyDeviceToHost) );
-checkCuda( cudaFree(u_next) );
-checkCuda( cudaFree(u_curr) );
-visit_dump(g, "cuda.silo");
-return 0;
+wec.init();
+solve_sequential(wec);
+wec.init();
+solve_sequential_blocked(wec);
+auto maxthreads = std::thread::hardware_concurrency();
+for (size_t i = 1; i <= maxthreads; i *= 2)
{
wec.init();
solve_multithread(wec, i);
}
}