From 374afe267e0e184edf040acff6e7c39d36eb1108 Mon Sep 17 00:00:00 2001
From: Matteo Cicuttin <datafl4sh@toxicnet.eu>
Date: Mon, 23 Mar 2020 16:49:00 +0100
Subject: [PATCH] Reorganized fd_catalog a bit.

---
 kokkos-testing/experimental_data/timings.plot |   4 +-
 kokkos-testing/experimental_data/timings2.txt |   3 -
 kokkos-testing/fd_catalog/CMakeLists.txt      |  27 +-
 kokkos-testing/fd_catalog/fd_grid.hpp         |  28 ++
 kokkos-testing/fd_catalog/fd_main.cpp         |  11 +-
 kokkos-testing/fd_catalog/fd_wave.hpp         |   5 +-
 kokkos-testing/fd_catalog/fd_wave_cpu.hpp     | 358 +++++++++++-------
 kokkos-testing/fd_catalog/fd_wave_cuda.cu     |  46 ++-
 .../fd_catalog/finite_difference_kokkos.cpp   | 244 ++++++++++++
 kokkos-testing/test_gemv.cpp                  |  37 ++
 kokkos-testing/test_mandel.cpp                | 135 +++++++
 kokkos-testing/test_sumbw.cpp                 |  87 +++++
 12 files changed, 825 insertions(+), 160 deletions(-)
 delete mode 100644 kokkos-testing/experimental_data/timings2.txt
 create mode 100644 kokkos-testing/fd_catalog/finite_difference_kokkos.cpp
 create mode 100644 kokkos-testing/test_gemv.cpp
 create mode 100644 kokkos-testing/test_mandel.cpp
 create mode 100644 kokkos-testing/test_sumbw.cpp

diff --git a/kokkos-testing/experimental_data/timings.plot b/kokkos-testing/experimental_data/timings.plot
index e59f1c5..3484c34 100644
--- a/kokkos-testing/experimental_data/timings.plot
+++ b/kokkos-testing/experimental_data/timings.plot
@@ -1,5 +1,5 @@
 set term postscript enhanced color
-set output 'timing.eps'
+set output 'timing-float.eps'
 
 set title '2D damped wave equation'
 
@@ -7,4 +7,4 @@ set style data histogram
 set style fill solid border -1
 set logscale y
 set grid ytics mytics
-plot 'timings-float.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col
+plot 'timings.txt' using 2:xtic(1) ti col, '' u 3 ti col, '' u 4 ti col, '' u 5 ti col, '' u 6 ti col, '' u 7 ti col
diff --git a/kokkos-testing/experimental_data/timings2.txt b/kokkos-testing/experimental_data/timings2.txt
deleted file mode 100644
index f10583e..0000000
--- a/kokkos-testing/experimental_data/timings2.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-"order"     "Baseline"  "Serial"   "1 thread"   "2 threads"  "4 threads" "CUDA baseline"
-"2th order" 0.00707879  0.0141107  0.0138042   0.0072022  0.00539084
-"8th order" 0.033213    0.0419782  0.0409406   0.020956   0.0134292 0.0015
diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt
index 6efe136..a6f85c5 100644
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -1,11 +1,24 @@
 cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
 project(fd_catalog)
 include(CheckLanguage)
+include(FetchContent)
 
-
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
+option(ENABLE_KOKKOS "Enable Kokkos" ON)
+if (ENABLE_KOKKOS)
+    FetchContent_Declare(kokkos
+        GIT_REPOSITORY https://github.com/kokkos/kokkos.git
+    )
+    FetchContent_MakeAvailable(kokkos)
+
+    add_executable(finite_difference_kokkos finite_difference_kokkos.cpp)
+    target_link_libraries(finite_difference_kokkos Kokkos::kokkos siloh5)
+    #add_definitions(-DHAVE_KOKKOS)
+    #set(HAVE_KOKKOS TRUE)
+endif()
+
 option(ENABLE_CUDA "Enable CUDA if present" ON)
 if (ENABLE_CUDA)
     check_language(CUDA)
@@ -20,9 +33,15 @@ if (ENABLE_CUDA)
 endif()
 
 set(CMAKE_CXX_FLAGS_DEBUG "-g")
-set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG")
 set(CMAKE_CXX_FLAGS_RELEASEASSERT "-O3 -g -fpermissive")
 
+option(A_SAVE_TS "Save timesteps" OFF)
+if (A_SAVE_TS)
+    add_definitions(-DHAVE_SILO)
+    add_definitions(-DSAVE_TIMESTEPS)
+endif()
+
 option(ENABLE_SINGLE "Enable single precision build" ON)
 if (ENABLE_SINGLE)
     if (HAVE_CUDA)
@@ -34,6 +53,7 @@ if (ENABLE_SINGLE)
         add_executable(fd_catalog_single fd_main.cpp)
     endif()
     target_compile_definitions(fd_catalog_single PUBLIC -DSINGLE_PRECISION)
+    target_link_libraries(fd_catalog_single siloh5)
 endif()
 
 option(ENABLE_DOUBLE "Enable double precision build" ON)
@@ -46,4 +66,5 @@ if (ENABLE_DOUBLE)
     else()
         add_executable(fd_catalog_double fd_main.cpp)
     endif()
+    target_link_libraries(fd_catalog_double siloh5)
 endif()
diff --git a/kokkos-testing/fd_catalog/fd_grid.hpp b/kokkos-testing/fd_catalog/fd_grid.hpp
index 4199541..6aae1db 100644
--- a/kokkos-testing/fd_catalog/fd_grid.hpp
+++ b/kokkos-testing/fd_catalog/fd_grid.hpp
@@ -105,6 +105,34 @@ public:
     T*          data()          { return m_data.data(); }
     const T*    data() const    { return m_data.data(); }
 
+    T* data(Int i, Int j)
+    {
+        auto ofs_row        = i + m_halo;
+        assert(ofs_row < m_domain_rows + 2*m_halo);
+
+        auto ofs_col        = j + m_halo;
+        assert(ofs_col < m_domain_cols + 2*m_halo);
+
+        auto grid_cols      = m_domain_cols+2*m_halo;
+        auto ofs            = ofs_row * grid_cols + ofs_col;
+        assert( (ofs >= 0) and (ofs < m_data.size()) );
+        return m_data.data() + ofs;
+    }
+
+    const T* data(Int i, Int j) const
+    {
+        auto ofs_row        = i + m_halo;
+        assert(ofs_row < m_domain_rows + 2*m_halo);
+
+        auto ofs_col        = j + m_halo;
+        assert(ofs_col < m_domain_cols + 2*m_halo);
+
+        auto grid_cols      = m_domain_cols+2*m_halo;
+        auto ofs            = ofs_row * grid_cols + ofs_col;
+        assert( (ofs >= 0) and (ofs < m_data.size()) );
+        return m_data.data() + ofs;
+    }
+
     size_t      size() const    { return m_data.size(); }
 
     T           dx() const { return 1./(m_domain_cols-1); }
diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp
index a98258a..458d937 100644
--- a/kokkos-testing/fd_catalog/fd_main.cpp
+++ b/kokkos-testing/fd_catalog/fd_main.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <fstream>
 #include <cstdio>
-
+#include <unistd.h>
 #include "fd_wave_cpu.hpp"
 
 #ifdef HAVE_CUDA
@@ -22,11 +22,11 @@ int main(void)
     ofs << "\"SIZE\"    \"Seq\"    \"SeqBlk\"    ";
 
 #ifdef HAVE_CUDA
-    ofs << "Cuda    ";
+    ofs << "\"Cuda\"    ";
 #endif
 
     auto maxthreads = std::thread::hardware_concurrency();
-    for (size_t threads = 1; threads <= maxthreads; threads *= 2)
+    for (size_t threads = 1; threads < maxthreads; threads *= 2)
         ofs << "\"" << threads << " threads\"    ";
     ofs << std::endl;
 
@@ -34,8 +34,7 @@ int main(void)
 
     for (size_t sz = 128; sz <= 1024; sz *= 2)
     {
-        wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 500);
-
+        wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
         ofs << sz << "    ";
 
         wec.init();
@@ -54,7 +53,7 @@ int main(void)
 
         auto maxthreads = std::thread::hardware_concurrency();
 
-        for (size_t threads = 1; threads <= maxthreads; threads *= 2)
+        for (size_t threads = 1; threads < maxthreads; threads *= 2)
         {
             wec.init();
             time = solve_multithread(wec, threads);
diff --git a/kokkos-testing/fd_catalog/fd_wave.hpp b/kokkos-testing/fd_catalog/fd_wave.hpp
index b58c88c..a57b557 100644
--- a/kokkos-testing/fd_catalog/fd_wave.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave.hpp
@@ -30,7 +30,10 @@ struct wave_equation_context
     T       dt;
     int     maxiter;
 
-    wave_equation_context(size_t rows, size_t cols, T vel, T damp, T pdt, int pmaxiter)
+    size_t  rows, cols;
+
+    wave_equation_context(size_t prows, size_t pcols, T vel, T damp, T pdt, int pmaxiter)
+        : rows(prows), cols(pcols)
     {
         g_prev = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
         g_curr = fd_grid<T>(rows, cols, WAVE_8_HALO_SIZE);
diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
index ae9c2b0..c802f0f 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -10,7 +10,10 @@
 #include <cassert>
 #include <list>
 #include <cmath>
-#include <functional>
+#include <algorithm>
+#include <sstream>
+
+#include <string.h>
 
 #include "fd_grid.hpp"
 #include "fd_dataio.hpp"
@@ -37,14 +40,18 @@ operator<<(std::ostream& os, const tile& tl)
     return os;
 }
 
-/* This is the naive implementation of a wave equation kernel. You can specify
- * a tile to compute only on a subdomain. */
+#define BLK_ROWS 56
+#define BLK_COLS 56
+
 template<typename T>
 void
-wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
-                     fd_grid<T>& g_next,
-                     const wave_2D_params<T>& params, tile tl)
+wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
+               fd_grid<T>& g_next,
+               const wave_2D_params<T>& params)
 {
+    /* 1) memcpy is faster than for
+     * 2) if you put inner block processing, you kill perf
+     */
     int maxrow  = params.maxrow;
     int maxcol  = params.maxcol;
     T   dt      = params.dt;
@@ -62,56 +69,188 @@ wave_2D_kernel_tiled(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     static const T w4 = -1.0/560.0;
     static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
 
-    for (size_t i = 0; i < tl.dim_y; i++)
+    T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE];
+
+    T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
+    T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
+    T one_minus_adt = (1.0 - a*dt);
+    T two_minus_adt = (2.0 - a*dt);
+
+    size_t blocks_x = maxcol / BLK_COLS;
+    size_t blocks_y = maxrow / BLK_ROWS;
+    size_t rem_x = maxcol % BLK_COLS;
+    size_t rem_y = maxrow % BLK_ROWS;
+
+
+    /* Do fixed-size blocks */
+    for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
+    {
+        for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
+        {
+            /* Here all the dimensions are known at compile time, let the
+             * compiler do its job. */
+            for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
+            {
+                int ofs_x = bj - WAVE_8_HALO_SIZE;
+                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+                assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
+                memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
+            }
+
+            for (size_t i = 0; i < BLK_ROWS; i++)
+            {
+                for (size_t j = 0; j < BLK_COLS; j++)
+                {
+                    T lapl = 0.0;
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
+
+                    T val = lapl -
+                            one_minus_adt * g_prev(bi+i, bj+j) +
+                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+                    
+                    assert(j < BLK_COLS);
+                    g_next(bi+i, bj+j) = val;
+                }
+            }
+        }
+    }
+
+    /* Do upper blocks */
+    {
+        size_t bi = blocks_y*BLK_ROWS;
+        for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
+        {
+            for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
+            {
+                
+                int ofs_x = bj - WAVE_8_HALO_SIZE;
+                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+                assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
+                memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
+            }
+
+            for (size_t i = 0; i < rem_y; i++)
+            {
+                for (size_t j = 0; j < BLK_COLS; j++)
+                {
+                    T lapl = 0.0;
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
+
+                    T val = lapl -
+                            one_minus_adt * g_prev(bi+i, bj+j) +
+                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+                    
+                    assert(j < BLK_COLS);
+                    g_next(bi+i, bj+j) = val;
+                }
+            }
+        }
+    }
+
+    /* Do upper leftmost corner */
     {
-        for (size_t j = 0; j < tl.dim_x; j++)
+        size_t bi = blocks_y*BLK_ROWS;
+        size_t bj = blocks_x*BLK_COLS;
+
+        for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
         {
-            /* Get X/Y coordinates of this grid point */
-            int ofs_x = tl.dim_x * tl.idx_x + j;
-            int ofs_y = tl.dim_y * tl.idx_y + i;
+            
+            int ofs_x = bj - WAVE_8_HALO_SIZE;
+            int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+            assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
+            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
+        }
 
-            if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
+        for (size_t i = 0; i < rem_y; i++)
+        {
+            for (size_t j = 0; j < rem_x; j++)
             {
-                T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
-                T deriv_x = 0.0;
-                //deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]);
+                T lapl = 0.0;
                 for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y,ofs_x+k);
+                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
         
-                T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
-                T deriv_y = 0.0;
-                //deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]);
                 for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(ofs_y+k,ofs_x);
+                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
+
+                T val = lapl -
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+                
+                assert(j < BLK_COLS);
+                g_next(bi+i, bj+j) = val;
+            }
+        }
+    }
+
+    /* Do leftmost blocks */
+    for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
+    {
+        size_t bj = blocks_x*BLK_COLS;
 
-                T val = (deriv_x + deriv_y) - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) + (2.0 - a*dt)*g_curr(ofs_y, ofs_x);
-                if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
-                    val = 0;
+        for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
+        {
+            
+            int ofs_x = bj - WAVE_8_HALO_SIZE;
+            int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+            assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
+            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T));
+        }
 
-                g_next(ofs_y, ofs_x) = val;
+        for (size_t i = 0; i < BLK_ROWS; i++)
+        {
+            for (size_t j = 0; j < rem_x; j++)
+            {
+                T lapl = 0.0;
+                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
+        
+                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
+
+                T val = lapl -
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+                
+                assert(j < BLK_COLS);
+                g_next(bi+i, bj+j) = val;
             }
         }
     }
-}
 
-/* This is a "blocked" kernel. It copies a block on a "private" chunk of
- * memory, computes there and stores the result back. This approach looks
- * slightly faster than the "naive" kernel. */
+    /* Apply BCs */
+    for (size_t j = 0; j < maxcol; j++)
+    {
+        g_next(0,j) = 0;
+        g_next(maxrow-1, j) = 0;
+    }
 
-#define WAVE_8_CPUKER_COLS  56
-#define WAVE_8_CPUKER_ROWS  56
+    for (size_t i = 0; i < maxrow; i++)
+    {
+        g_next(i, 0) = 0;
+        g_next(i, maxcol-1) = 0;
+    }
+}
 
 template<typename T>
 void
-wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
-                     fd_grid<T>& g_next,
-                     const wave_2D_params<T>& params, tile tl)
+wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
+               fd_grid<T>& g_next,
+               const wave_2D_params<T>& params,
+               size_t thread_id = 0, size_t num_threads = 1)
 {
-    const int maxrow  = params.maxrow;
-    const int maxcol  = params.maxcol;
-    const T   dt      = params.dt;
-    const T   c       = params.velocity;
-    const T   a       = params.damping;
+    int maxrow  = params.maxrow;
+    int maxcol  = params.maxcol;
+    T   dt      = params.dt;
+    T   c       = params.velocity;
+    T   a       = params.damping;
 
     assert(maxcol > 1);
     assert(maxrow > 1);
@@ -124,60 +263,28 @@ wave_2D_kernel_tiled_blocked(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     static const T w4 = -1.0/560.0;
     static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
 
-    static const size_t MX_SZ = WAVE_8_CPUKER_COLS+2*WAVE_8_HALO_SIZE;
-    static const size_t MY_SZ = WAVE_8_CPUKER_ROWS+2*WAVE_8_HALO_SIZE;
-
-    T s_curr[MX_SZ][MY_SZ];
+    T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
+    T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
+    T one_minus_adt = (1.0 - a*dt);
+    T two_minus_adt = (2.0 - a*dt);
 
-    for (size_t bi = 0; bi < tl.dim_y; bi += WAVE_8_CPUKER_ROWS)
+    for (size_t i = thread_id; i < maxrow; i += num_threads)
     {
-        for (size_t bj = 0; bj < tl.dim_x; bj += WAVE_8_CPUKER_COLS)
+        for (size_t j = 0; j < maxcol; j++)
         {
-            /* Load working tile */
-            for (size_t i = 0; i < MY_SZ; i++)
-            {
-                for (size_t j = 0; j < MX_SZ; j++)
-                {
-                    int ofs_x = tl.idx_x * tl.dim_x + bj + j;
-                    int ofs_y = tl.idx_y * tl.dim_y + bi + i;
-
-                    if ( (ofs_x < (maxcol+WAVE_8_HALO_SIZE)) and (ofs_y < (maxrow+WAVE_8_HALO_SIZE)) )
-                        s_curr[i][j] = g_curr(ofs_y-WAVE_8_HALO_SIZE, ofs_x-WAVE_8_HALO_SIZE);
-                }
-            }
-
-            /* Do finite differences */
-            for (size_t i = 0; i < WAVE_8_CPUKER_ROWS; i++)
-            {
-                for (size_t j = 0; j < WAVE_8_CPUKER_COLS; j++)
-                {
-                    /* Get X/Y coordinates of this grid point */
-                    int ofs_x = tl.idx_x * tl.dim_x + bj + j;
-                    int ofs_y = tl.idx_y * tl.dim_y + bi + i;
-
-                    if ( (ofs_x < maxcol) and (ofs_y < maxrow) )
-                    {
-                        T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
-                        T deriv_x = 0.0;
-                        for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                            deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
-        
-                        T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
-                        T deriv_y = 0.0;
-                        for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                            deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
-
-                        T val = (deriv_x + deriv_y)
-                              - (1.0 - a*dt) * g_prev(ofs_y, ofs_x) 
-                              + (2.0 - a*dt) * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+            T lapl = 0.0;
+            for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i,j+k);
+    
+            for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j);
 
-                        if ( (ofs_x == 0) or (ofs_y == 0) or (ofs_x == maxcol-1) or (ofs_y == maxrow-1) )
-                            val = 0;
+            T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j);
 
-                        g_next(ofs_y, ofs_x) = val;
-                    }
-                }
-            }
+            if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
+                val = 0;
+            
+            g_next(i, j) = val;
         }
     }
 }
@@ -202,16 +309,16 @@ solve_sequential_aux(wave_equation_context<T>& wec)
 
     double time = 0.0;
 
-    std::ofstream ofs("itertime.txt");
+    //std::ofstream ofs("itertime.txt");
 
     for (size_t i = 0; i < wec.maxiter; i++)
     {
         auto start = std::chrono::high_resolution_clock::now();
 
         if (blocked)
-            wave_2D_kernel_tiled_blocked(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
+            wave_2D_kernel_blk(wec.g_prev, wec.g_curr, wec.g_next, params);
         else
-            wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
+            wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params);
 
         auto stop = std::chrono::high_resolution_clock::now();
 
@@ -226,26 +333,37 @@ solve_sequential_aux(wave_equation_context<T>& wec)
         if ( (i%100) == 0 )
         {
             std::stringstream ss;
-            ss << "wave_tiled_" << i << ".silo";
+            ss << "wave_seq_" << i << ".silo";
             visit_dump(wec.g_curr, ss.str());
         }
 #endif /* SAVE_TIMESTEPS */
 #endif /* HAVE_SILO */
 
-        ofs << i << " " << ms.count() << std::endl;
+        //ofs << i << " " << ms.count() << std::endl;
     }
     
     if (blocked)
     {
         std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
         std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl;
+
+        double itertime = time/wec.maxiter;
+        double gflops_s = 60*(params.maxrow*params.maxcol)/(1e6*itertime);
+        std::cout << "[Wave][SeqBlk] GFlops/s: " << gflops_s << std::endl;
     }
     else
     {
         std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
         std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl;
-    }
+        
+        double itertime = time/wec.maxiter;
+        double gflops_s = 60*(params.maxrow*params.maxcol)/(1e6*itertime);
+        std::cout << "[Wave][Sequential] GFlops/s: " << gflops_s << std::endl;
 
+        size_t kernel_bytes = 3*sizeof(T)*(params.maxrow*params.maxcol);
+        double gbytes_s = kernel_bytes/(1e6*itertime);
+        std::cout << "[Wave][Sequential] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
+    }
 #ifdef HAVE_SILO
     visit_dump(wec.g_curr, "wave_sequential_lastiter.silo");
 #endif /* HAVE_SILO */
@@ -276,95 +394,57 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
     params.velocity = wec.velocity;
     params.damping = wec.damping;
 
-    /* Tile sizes */
-    size_t tile_size_x = 32;
-    size_t tile_size_y = 32;
-    size_t num_tiles_x = params.maxcol/tile_size_x + 1;
-    size_t num_tiles_y = params.maxrow/tile_size_y + 1;
-
 
     /* Multithreading stuff */
-    std::mutex                  cout_mtx;
     std::mutex                  cv_mtx;
     std::condition_variable     prod_cv;
     std::condition_variable     cons_cv;
-    std::list<tile>             tiles;
-    size_t                      running_threads;
-    bool                        iteration_finished = false;
     std::vector<bool>           thread_done(nths);
+    bool                        iteration_finished = false;
 
-    for (size_t i = 0; i < nths; i++)
-        thread_done[i] = true;
-
-    auto thread_lambda = [&](int tid) {
+    auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
         while (1)
         {
             /* Wait for the producer to notify that there's something to do */
             {
                 std::unique_lock<std::mutex> lck(cv_mtx);
-                while ( thread_done[tid] )
+                while ( thread_done[thread_id] )
                     cons_cv.wait(lck);
 
                 if (iteration_finished)
-                {
-                    assert(tiles.size() == 0);
                     return;
-                }
             }
 
-            /* Fetch work and process data */
-            while (1)
-            {
-                tile tl;
-
-                {
-                    std::unique_lock<std::mutex> lck(cv_mtx);
-                    if (tiles.size() == 0)
-                        break;
-
-                    assert(tiles.size() != 0);
-                    tl = tiles.front();
-                    tiles.pop_front();
-                }
-
-                wave_2D_kernel_tiled(wec.g_prev, wec.g_curr, wec.g_next, params, tl);
-            }
+            /* Do the timestep */
+            wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads);
 
             /* Work for this thread finished, notify producer */
             std::unique_lock<std::mutex> lck(cv_mtx);
-            //assert(running_threads > 0);
-            running_threads--;
             prod_cv.notify_one();
-            thread_done[tid] = true;
+            thread_done[thread_id] = true;
         }
     };
 
 
     std::vector<std::thread> threads(nths);
     for (size_t i = 0; i < nths; i++)
-        threads[i] = std::thread(thread_lambda, i);
+        threads[i] = std::thread(thread_lambda, i, nths);
 
     double time = 0.0;
 
     for (size_t iter = 0; iter < wec.maxiter; iter++)
     {
         auto start = std::chrono::high_resolution_clock::now();
-        std::unique_lock<std::mutex> lck(cv_mtx);
-        for (size_t i = 0; i < num_tiles_y; i++)
-            for (size_t j = 0; j < num_tiles_x; j++)
-                tiles.push_back( tile(tile_size_x, tile_size_y, j, i) );
 
+        std::unique_lock<std::mutex> lck(cv_mtx);
         /* Mark data ready and start the threads */
         for (size_t i = 0; i < nths; i++)
             thread_done[i] = false;
-        
         cons_cv.notify_all();
 
         while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) )
             prod_cv.wait(lck);
 
-        assert(tiles.size() == 0);
-
         auto stop = std::chrono::high_resolution_clock::now();
         std::chrono::duration<double, std::milli> ms = stop - start;
         time += ms.count();
diff --git a/kokkos-testing/fd_catalog/fd_wave_cuda.cu b/kokkos-testing/fd_catalog/fd_wave_cuda.cu
index b0edcb4..4ccf0b4 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cuda.cu
+++ b/kokkos-testing/fd_catalog/fd_wave_cuda.cu
@@ -6,6 +6,7 @@
 
 #include <iostream>
 #include <cassert>
+#include <sstream>
 
 #include "fd_grid.hpp"
 #include "fd_dataio.hpp"
@@ -37,6 +38,9 @@ template<typename T>
 __global__ void
 wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params)
 {
+    /* This kernel saturates the memory bandwidth on GTX650M. Memory bw
+     * was estimated using Mark Harris' copy/transpose code */
+
     //     HHHH
     //    H****H     H = halo, * = data
     //    H****H     We load a tile as in figure. This wastes bandwidth
@@ -44,6 +48,9 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params)
     //    H****H     the easiest way to remain generic w.r.t. grid size
     //     HHHH
 
+#ifdef USE_SHARED_PREV
+    __shared__ T s_prev[WAVE_8_KER_ROWS][WAVE_8_KER_COLS];
+#endif
     __shared__ T s_curr[WAVE_8_KER_ROWS+2*WAVE_8_HALO_SIZE][WAVE_8_KER_COLS+2*WAVE_8_HALO_SIZE];
 
     int maxrow  = params.maxrow;
@@ -55,6 +62,14 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params)
     assert(maxcol > 1);
     assert(maxrow > 1);
 
+    T c2dt2 = c*c*dt*dt;
+    T kx2 = c2dt2 * (maxcol-1)*(maxcol-1);
+    T ky2 = c2dt2 * (maxrow-1)*(maxrow-1);
+    /* Don't ask me why but if you put the rhs of this in the computation
+     * of val the kernel runs 2x slower */
+    T one_minus_adt = (1.0 - a*dt);
+    T two_minus_adt = (2.0 - a*dt);
+
     /* Get X/Y coordinates of this grid point */
     int ofs_x = blockDim.x * blockIdx.x + threadIdx.x + WAVE_8_HALO_SIZE;
     int ofs_y = blockDim.y * blockIdx.y + threadIdx.y + WAVE_8_HALO_SIZE;
@@ -69,6 +84,9 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params)
 
         s_curr[i][j] = u_curr[ofs];
 
+#ifdef USE_SHARED_PREV
+        s_prev[threadIdx.y][threadIdx.x] = u_prev[ofs];
+#endif
         if (threadIdx.x < WAVE_8_HALO_SIZE)
         {
             /* X-halo */
@@ -95,25 +113,32 @@ wave_2D_kernel_cuda(T* u_prev, T* u_curr, T* u_next, wave_2D_params<T> params)
         if ( std::is_same<T,double>::value)
             w = (T*)d_w64;
 
-        T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
         T deriv_x = 0.0;
         //deriv_x = kx2 * (s_curr[i][j-1] - 2.0*s_curr[i][j] + s_curr[i][j+1]);
         for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
             deriv_x += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i][j+k];
-        
-        T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
+
         T deriv_y = 0.0;
         //deriv_y = ky2 * (s_curr[i-1][j] - 2.0*s_curr[i][j] + s_curr[i+1][j]);
         for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
             deriv_y += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+k][j];
 
-        T val = (deriv_x + deriv_y) - (1.0 - a*dt) * u_prev[ofs] + (2.0 - a*dt)*s_curr[i][j];
+#ifdef USE_SHARED_PREV
+        T val = (deriv_x + deriv_y)
+              - one_minus_adt * s_prev[threadIdx.y][threadIdx.x]
+              + two_minus_adt * s_curr[i][j];
+#else 
+        T val = (deriv_x + deriv_y)
+              - one_minus_adt * u_prev[ofs]
+              + two_minus_adt * s_curr[i][j];
+#endif
+
         if ( (ofs_x == WAVE_8_HALO_SIZE) or (ofs_y == WAVE_8_HALO_SIZE) or
              (ofs_x == maxcol+WAVE_8_HALO_SIZE-1) or (ofs_y == maxrow+WAVE_8_HALO_SIZE-1)
            )
             val = 0;
 
-        u_next[ofs] = val; 
+        u_next[ofs] = val;
     }
 }
 
@@ -175,6 +200,7 @@ double solve_cuda_aux(wave_equation_context<T>& wec)
 
     for (size_t i = 0; i < wec.maxiter; i++)
     {
+        //sum_kernel_cuda<<<grid_size, threads_per_block>>>(u_prev, u_curr, u_next, params);
         wave_2D_kernel_cuda<<<grid_size, threads_per_block>>>(u_prev, u_curr, u_next, params);
         
 
@@ -204,13 +230,21 @@ double solve_cuda_aux(wave_equation_context<T>& wec)
 
     std::cout << "[Wave][Cuda] Iteration Time: " << milliseconds/wec.maxiter << "ms" << std::endl;
     std::cout << "[Wave][Cuda] Wall Time: " << milliseconds << "ms" << std::endl;
+    
+    double itertime = milliseconds/wec.maxiter;
+    double gflops_s = 70*(params.maxrow*params.maxcol)/(1e6*itertime);
+    std::cout << "[Wave][Cuda] GFlops/s: " << gflops_s << std::endl;
+
+    size_t kernel_bytes = (40*40+2*32*32)*sizeof(T)*grid_size.x*grid_size.y;
+    double gbytes_s = kernel_bytes/(1e6*itertime);
+    std::cout << "[Wave][Cuda] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
 
 #ifdef HAVE_SILO
     checkCuda( cudaMemcpy(wec.g_curr.data(), u_curr, wec.g_curr.size()*sizeof(T), cudaMemcpyDeviceToHost) );
     visit_dump(wec.g_curr, "wave_cuda_lastiter.silo");
 #endif /* HAVE_SILO */
 
-    return milliseconds/wec.maxiter;
+    return itertime;
 }
 
 double solve_cuda(wave_equation_context<float>& wec)
diff --git a/kokkos-testing/fd_catalog/finite_difference_kokkos.cpp b/kokkos-testing/fd_catalog/finite_difference_kokkos.cpp
new file mode 100644
index 0000000..9de8a2b
--- /dev/null
+++ b/kokkos-testing/fd_catalog/finite_difference_kokkos.cpp
@@ -0,0 +1,244 @@
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <cmath>
+#include <chrono>
+#include <type_traits>
+
+#include <silo.h>
+#include <Kokkos_Core.hpp>
+
+#define WAVE_8_HALO_SIZE 4
+
+using namespace Kokkos;
+using namespace std::chrono;
+
+template<typename T>
+int
+visit_dump(const Kokkos::View<T**>& kv, const std::string& fn)
+{
+    static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value,
+                  "Only double or float");
+
+    DBfile *db = nullptr;
+    db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "Kokkos test", DB_HDF5);
+    if (!db)
+    {
+        std::cout << "Cannot write simulation output" << std::endl;
+        return -1;
+    }
+
+    auto size_x = kv.extent(0);
+    auto size_y = kv.extent(1);
+    
+    std::vector<T> x(size_x);
+    std::vector<T> y(size_y);
+    
+    for (size_t i = 0; i < size_x; i++)
+        x.at(i) = T(i)/(size_x-1);
+        
+    for (size_t i = 0; i < size_y; i++)
+        y.at(i) = T(i)/(size_y-1);
+        
+    int dims[] = { int(size_y), int(size_y) };
+    int ndims = 2;
+    T *coords[] = { x.data(), y.data() };
+    
+    if (std::is_same<T,float>::value)
+        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims,
+                      DB_FLOAT, DB_COLLINEAR, NULL);
+
+    if (std::is_same<T,double>::value)
+        DBPutQuadmesh(db, "mesh", NULL, coords, dims, ndims,
+                      DB_DOUBLE, DB_COLLINEAR, NULL);
+    
+    std::vector<T> data(x.size() * y.size());
+    
+    for (size_t i = 0; i < x.size(); i++)
+        for (size_t j = 0; j < y.size(); j++)
+            data.at(i*y.size()+j) = kv(i,j);
+    
+    if (std::is_same<T,float>::value)
+        DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims,
+                      NULL, 0, DB_FLOAT, DB_NODECENT, NULL);
+    
+    if (std::is_same<T,double>::value)
+        DBPutQuadvar1(db, "solution", "mesh", data.data(), dims, ndims,
+                      NULL, 0, DB_DOUBLE, DB_NODECENT, NULL);
+
+    DBClose(db);
+    return 0;
+}
+
+template<typename T>
+struct wave_equation_context_kokkos
+{
+    View<T**>   g_prev;
+    View<T**>   g_curr;
+    View<T**>   g_next;
+
+    T       velocity;
+    T       damping;
+    T       dt;
+    int     maxiter;
+
+    size_t  rows, cols;
+    size_t  grows, gcols;
+
+
+    wave_equation_context_kokkos(size_t prows, size_t pcols,
+                                 T vel, T damp, T pdt, int pmaxiter)
+        : rows(prows), cols(pcols),
+          grows(prows+2*WAVE_8_HALO_SIZE), gcols(pcols+2*WAVE_8_HALO_SIZE)
+    {
+
+        g_prev = View<T**>("g_prev", grows, gcols);
+        g_curr = View<T**>("g_curr", grows, gcols);
+        g_next = View<T**>("g_next", grows, gcols);
+        velocity    = vel;
+        damping     = damp;
+        dt          = pdt;
+        maxiter     = pmaxiter;
+
+        init();
+    }
+
+    bool is_halo(size_t i, size_t j)
+    {
+        if (i < WAVE_8_HALO_SIZE)
+            return true;
+        if (j < WAVE_8_HALO_SIZE)
+            return true;
+        if (i >= rows+WAVE_8_HALO_SIZE)
+            return true;
+        if (j >= cols+WAVE_8_HALO_SIZE)
+            return true;
+        return false;
+    }
+
+    void init(void)
+    {
+        auto dx = 1./(cols-1);
+        auto dy = 1./(rows-1);
+
+        for (size_t i = 0; i < grows; i++)
+        {
+            for (size_t j = 0; j < gcols; j++)
+            {
+                if ( is_halo(i,j) )
+                {
+                    g_prev(i,j) = 0.0;
+                    g_curr(i,j) = 0.0;
+                    g_next(i,j) = 0.0; 
+                }
+                else
+                {
+                    T y = dy*i - 0.3;
+                    T x = dx*j - 0.1;
+                    g_prev(i,j) = -std::exp(-2400*(x*x + y*y));
+                    g_curr(i,j) = 2*dt*g_prev(i,j);
+                    g_next(i,j) = 0.0;
+                }
+            }
+        }
+    }
+};
+
+
+template<typename T>
+double solve_kokkos(wave_equation_context_kokkos<T>& wec)
+{
+    int maxrow  = wec.rows;
+    int maxcol  = wec.cols;
+    T   dt      = wec.dt;
+    T   c       = wec.velocity;
+    T   a       = wec.damping;
+
+    assert(maxcol > 1);
+    assert(maxrow > 1);
+
+    // specifying tiling explicitly worsens perf
+    MDRangePolicy<Rank<2>> range({0,0}, {maxrow-1, maxcol-1});
+
+    static const T w0 = -205.0/72.0;
+    static const T w1 =    8.0/5.0;
+    static const T w2 =   -1.0/5.0;
+    static const T w3 =    8.0/315.0;
+    static const T w4 =   -1.0/560.0;
+    static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
+
+    T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
+    T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
+    T one_minus_adt = (1.0 - a*dt);
+    T two_minus_adt = (2.0 - a*dt);
+
+    double iter_time = 0.0;
+    for (int ts = 0; ts < wec.maxiter; ts++)
+    {
+        auto t_begin = high_resolution_clock::now();
+
+        parallel_for(range, KOKKOS_LAMBDA(int i, int j)
+        {
+            i += WAVE_8_HALO_SIZE;
+            j += WAVE_8_HALO_SIZE;
+
+            T lapl = 0.0;
+            for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * wec.g_curr(i,j+k);
+
+            for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * wec.g_curr(i+k,j);
+
+            T val = lapl - one_minus_adt * wec.g_prev(i, j) + two_minus_adt * wec.g_curr(i, j);
+
+            if ( (i == 0) or 
+                 (j == 0) or 
+                 (i == maxrow-1) or 
+                 (j == maxcol-1)
+               )
+                val = 0;
+        
+            wec.g_next(i, j) = val;
+        });
+
+        auto t_end = high_resolution_clock::now();
+        
+        std::swap(wec.g_prev, wec.g_curr);
+        std::swap(wec.g_curr, wec.g_next);
+
+        std::chrono::duration<double, std::milli> ms = t_end - t_begin;
+        iter_time += ms.count();
+#if 0
+        if ( (ts % 100) == 0 )
+        {
+            std::stringstream ss;
+            ss << "wave_kokkos_" << ts << ".silo";
+            visit_dump(wec.g_curr, ss.str());
+        }
+#endif       
+    }
+
+    double avg_iter_time = iter_time/wec.maxiter;
+    std::cout << "Average iteration time: " << avg_iter_time << std::endl;
+    return avg_iter_time;
+}
+
+int main(int argc, char *argv[])
+{
+    using T = double;
+
+    Kokkos::initialize( argc, argv );
+
+    for (size_t sz = 128; sz <= 1024; sz *= 2)
+    {
+        wave_equation_context_kokkos<T> wec(sz, sz, 1, 0.1, 0.0001, 500);
+        wec.init();
+        solve_kokkos(wec);
+    }
+
+    Kokkos::finalize();
+
+    return 0;
+}
+
diff --git a/kokkos-testing/test_gemv.cpp b/kokkos-testing/test_gemv.cpp
new file mode 100644
index 0000000..0616c0b
--- /dev/null
+++ b/kokkos-testing/test_gemv.cpp
@@ -0,0 +1,37 @@
+#include <iostream>
+#include <vector>
+#include <chrono>
+
+template<typename T>
+void gemv(T alpha, const T *A, const T *x, T beta, T *y, size_t M, size_t N)
+{
+    for (size_t i = 0; i < M; i++)
+        for (size_t j = 0; j < N; j++)
+            y[i] = alpha*A[i*N+j]*x[j] + beta*y[i];
+}
+
+int main(void)
+{
+    using T = double;
+
+    static const size_t loops = 10000;
+    static const size_t M = 1000;
+    static const size_t N = 1000;
+
+    std::vector<T> A(M*N);
+    std::vector<T> y(M), x(N);
+
+    T alpha = 0.1;
+    T beta = 0.1;
+
+    auto start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < loops; i++)
+        gemv(alpha, A.data(), x.data(), beta, y.data(), M, N);
+    auto stop = std::chrono::high_resolution_clock::now();
+    
+    std::chrono::duration<double, std::milli> ms = stop - start;
+
+    std::cout << "GEMV time: " << ms.count()/loops << " ms"  << std::endl;   
+
+    return 0;
+}
diff --git a/kokkos-testing/test_mandel.cpp b/kokkos-testing/test_mandel.cpp
new file mode 100644
index 0000000..6365842
--- /dev/null
+++ b/kokkos-testing/test_mandel.cpp
@@ -0,0 +1,135 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <chrono>
+
+#define YMIN (-1.5)
+#define YMAX (1.5)
+#define XMIN (-2.5)
+#define XMAX (1.5)
+
+#define SIZE_Y	2560
+#define SIZE_X  2048
+#define GPU_BLOCK_X_SZ  32
+#define GPU_BLOCK_Y_SZ  32
+
+#define SY_TO_PLANE(y) (YMIN + y*(YMAX-YMIN)/SIZE_Y)
+#define SX_TO_PLANE(x) (XMIN + x*(XMAX-XMIN)/SIZE_X)
+
+#ifdef __NVCC__
+
+template<typename T>
+__global__
+void gpu_mandel(T *mandel, int maxiter)
+{
+    int j = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = blockDim.y * blockIdx.y + threadIdx.y;
+
+    if (j >= SIZE_X or i >= SIZE_Y)
+        return;
+
+	T z_re = 0;
+	T z_im = 0;
+	T c_re = SX_TO_PLANE(j);                    // 4 ops
+	T c_im = SY_TO_PLANE(i);                    // 4 ops
+
+			
+	T zabs2 = (z_re * z_re) + (z_im * z_im);    // 3 ops
+	int iter = 0;
+                                                // = 11 ops always
+	while (zabs2 < 4.0 && iter < maxiter)
+	{
+		T z_re2 = z_re*z_re;                    // 1 op
+		T z_im2 = z_im*z_im;                    // 1 op
+		T z_ri = 2*z_re*z_im;                   // 2 ops
+
+		z_re = z_re2 - z_im2 + c_re;            // 2 ops
+		z_im = z_ri + c_im;                     // 1 op
+
+		zabs2 = z_re2 + z_im2;                  // 1 op
+		iter++;
+	}
+                                                // = 8 ops * iter
+	if (iter == maxiter)
+		mandel[i*SIZE_X + j] = 0.0;
+	else
+		mandel[i*SIZE_X + j] = iter;
+}
+
+#endif /* __NVCC__  */
+
+template<typename T>
+void compute_gpu(std::vector<T>& mandel, int maxiter)
+{
+    dim3 grid_size(SIZE_X/GPU_BLOCK_SIZE_X + 1, SIZE_Y/GPU_BLOCK_SIZE_Y + 1);
+    dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS);
+}
+
+int main(void)
+{
+	size_t maxiter = 1000;
+	using T = double;
+
+	std::vector<T> mandel(SIZE_X*SIZE_Y);
+
+	auto start = std::chrono::high_resolution_clock::now();
+
+    size_t flops = 0;
+
+    #pragma omp parallel for
+	for (size_t i = 0; i < SIZE_Y; i++)
+	{
+		for (size_t j = 0; j < SIZE_X; j++)
+		{
+			T z_re = 0;
+			T z_im = 0;
+			T c_re = SX_TO_PLANE(j);                    // 4 ops
+			T c_im = SY_TO_PLANE(i);                    // 4 ops
+
+			
+			T zabs2 = (z_re * z_re) + (z_im * z_im);    // 3 ops
+			size_t iter = 0;
+                                                        // = 11 ops always
+			while (zabs2 < 4.0 && iter < maxiter)
+			{
+				T z_re2 = z_re*z_re;                    // 1 op
+				T z_im2 = z_im*z_im;                    // 1 op
+				T z_ri = 2*z_re*z_im;                   // 2 ops
+
+				z_re = z_re2 - z_im2 + c_re;            // 2 ops
+				z_im = z_ri + c_im;                     // 1 op
+
+				zabs2 = z_re2 + z_im2;                  // 1 op
+				iter++;
+			}
+                                                        // = 8 ops * iter
+			if (iter == maxiter)
+				mandel[i*SIZE_X + j] = 0.0;
+			else
+				mandel[i*SIZE_X + j] = iter;
+		}
+	}
+
+	for (size_t i = 0; i < SIZE_Y; i++)
+		for (size_t j = 0; j < SIZE_X; j++)
+            if (size_t ni = mandel[i*SIZE_X+j]; ni != 0)
+                flops += 11+8*ni;
+            else
+                flops += 11+8*maxiter;
+
+	auto stop = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double, std::milli> s = stop - start;
+
+    std::cout << s.count() << std::endl;
+    std::cout << flops << std::endl;
+
+    std::cout << "GFlops/s: " << double(flops)/(1e6*s.count()) << std::endl;
+    
+	std::ofstream ofs("mandelbrot.txt");
+	for (size_t i = 0; i < SIZE_Y; i++)
+		for (size_t j = 0; j < SIZE_X; j++)
+			ofs << j << " " << i << " " << mandel[i*SIZE_X + j] << std::endl;
+	
+
+	return 0;
+}
diff --git a/kokkos-testing/test_sumbw.cpp b/kokkos-testing/test_sumbw.cpp
new file mode 100644
index 0000000..4ee808e
--- /dev/null
+++ b/kokkos-testing/test_sumbw.cpp
@@ -0,0 +1,87 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <chrono>
+#include <numeric>
+
+extern "C" {
+
+void sum_restrict(const double * __restrict__ prev,
+                  const double * __restrict__ curr,
+                  double * __restrict__ next,
+                  size_t sz)
+{
+    for (size_t i = 0; i < sz; i++)
+        next[i] = curr[i] + prev[i];
+}
+}
+
+template<typename T>
+void
+sum(const std::vector<T>& prev, const std::vector<T>& curr,
+    std::vector<T>& next)
+{
+    if ( not (prev.size() == curr.size() and curr.size() == next.size()) )
+        return;
+
+    for (size_t i = 0; i < curr.size(); i++)
+        next[i] = curr[i] + prev[i];
+}
+
+int main(void)
+{
+    using T = double;
+
+    std::vector<std::vector<double>> itertimes;
+    
+    size_t maxiter = 500;
+
+    for (size_t sz = 128; sz <= 8192; sz *= 2)
+    {
+        std::vector<T> prev(sz*sz);
+        std::vector<T> curr(sz*sz);
+        std::vector<T> next(sz*sz);
+
+        for(size_t i = 0; i < sz*sz; i++)
+        {
+            prev[i] = i + 1;
+            curr[i] = i % 18;
+            next[i] = 37*i;
+        }
+
+        std::vector<double> itertime(maxiter);
+        
+        for (size_t iter = 0; iter < maxiter; iter++)
+        {
+            auto start = std::chrono::high_resolution_clock::now();
+            sum_restrict(prev.data(), curr.data(), next.data(), sz*sz);
+            //memcpy(next.data(), curr.data(), sz*sz*sizeof(T));
+            std::swap(prev, curr);
+            std::swap(curr, next);
+            auto stop = std::chrono::high_resolution_clock::now();
+
+            std::chrono::duration<double, std::milli> ms = stop - start;
+            itertime[iter] = ms.count();
+        }
+
+        auto time = std::accumulate(itertime.begin(), itertime.end(), 0.0) / maxiter;
+        std::cout << "Sum bandwidth: " <<  3*sizeof(T)*sz*sz/(1e6*time);
+        std::cout << " GB/s"  << std::endl;
+
+        itertimes.push_back( std::move(itertime) );
+    }
+
+    std::ofstream ofs("itertimes.txt");
+
+    for (size_t i = 0; i < maxiter; i++)
+    {
+        for (size_t j = 0; j < itertimes.size(); j++)
+        {
+            ofs << itertimes[j][i] << " ";
+        }
+        ofs << std::endl;
+    }
+}
+
+
+
-- 
GitLab