diff --git a/kokkos-testing/experimental_data/AllTimes_double_update.plot b/kokkos-testing/experimental_data/AllTimes_double_update.plot
new file mode 100644
index 0000000000000000000000000000000000000000..fe76eb7ad9da7bb915586d31e4d9b1c6151d4381
--- /dev/null
+++ b/kokkos-testing/experimental_data/AllTimes_double_update.plot
@@ -0,0 +1,19 @@
+set term postscript enhanced color
+set output 'AllTimes_double_update.eps'
+
+set key left top
+
+set title '2D damped wave equation - double'
+
+set xlabel 'Mesh size'
+set ylabel 'Iteration time [ms]'
+
+set style data histogram
+set style fill solid border -1
+set logscale y
+set grid ytics mytics
+
+set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red'
+
+plot 'AllTimes_double_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \
+                        '' u 4 ti col, '' u 5 ti col, '' u 6 ti col
diff --git a/kokkos-testing/experimental_data/AllTimes_double_update.txt b/kokkos-testing/experimental_data/AllTimes_double_update.txt
new file mode 100644
index 0000000000000000000000000000000000000000..914a7e90a5970cc1e466d0608bfe020faf5cecb2
--- /dev/null
+++ b/kokkos-testing/experimental_data/AllTimes_double_update.txt
@@ -0,0 +1,5 @@
+"SIZE"  "Seq"       "SeqBlk"    "1T"        "2T"        "4T"      
+128     0.215397    0.0721867   0.231328    0.146418    0.150125  
+256     0.884406    0.284522    0.996983    0.519418    0.420706  
+512     3.49626     1.34328     3.55165     1.84903     1.3564    
+1024    13.9678     5.97425     14.4531     7.56775     5.25945   
diff --git a/kokkos-testing/experimental_data/AllTimes_float_update.plot b/kokkos-testing/experimental_data/AllTimes_float_update.plot
new file mode 100644
index 0000000000000000000000000000000000000000..9a4b0dcc9dca85a28b0d0a3db2eb99e9130a4e77
--- /dev/null
+++ b/kokkos-testing/experimental_data/AllTimes_float_update.plot
@@ -0,0 +1,20 @@
+set term postscript enhanced color
+set output 'AllTimes_float_update.eps'
+
+set key left top
+
+set title '2D damped wave equation - float'
+
+set xlabel 'Mesh size'
+set ylabel 'Iteration time [ms]'
+
+set style data histogram
+set style fill solid border -1
+set logscale y
+set grid ytics mytics
+
+set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red'
+
+plot 'AllTimes_float_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \
+                        '' u 4 ti col, '' u 5 ti col, '' u 6 ti col
+
diff --git a/kokkos-testing/experimental_data/AllTimes_float_update.txt b/kokkos-testing/experimental_data/AllTimes_float_update.txt
new file mode 100644
index 0000000000000000000000000000000000000000..77e1555fffafebe8abc5a21e92eda620d3eb4540
--- /dev/null
+++ b/kokkos-testing/experimental_data/AllTimes_float_update.txt
@@ -0,0 +1,6 @@
+"SIZE"  "Seq"       "SeqBlk"    "1T"        "2T"        "4T"     
+128     0.285575    0.0328261   0.266828    0.159186    0.1409   
+256     1.15701     0.13435     1.07333     0.531615    0.387834 
+512     4.65854     0.57531     4.21764     2.03206     1.42824  
+1024    13.9774     3.15193     16.5969     8.72612     5.83051  
+
diff --git a/kokkos-testing/experimental_data/CUDA_gflops_double_update.txt b/kokkos-testing/experimental_data/CUDA_gflops_double_update.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e9da9dcf92ffc1e9a17734a5bdfaa2fc422f4a3f
--- /dev/null
+++ b/kokkos-testing/experimental_data/CUDA_gflops_double_update.txt
@@ -0,0 +1,6 @@
+"Size"  "GT650M"    "GTX760"    "i7-3615QM"
+128     13.062      45.9258     13.1641 
+256     16.8322     62.7176     13.3595
+512     17.469      67.3603     11.3188
+1024    17.0132     68.6386     10.1799
+
diff --git a/kokkos-testing/experimental_data/i7-3615QM_gflops_double_update.plot b/kokkos-testing/experimental_data/i7-3615QM_gflops_double_update.plot
new file mode 100644
index 0000000000000000000000000000000000000000..69ac71c52bd341f1979bb879766f0916a648a391
--- /dev/null
+++ b/kokkos-testing/experimental_data/i7-3615QM_gflops_double_update.plot
@@ -0,0 +1,14 @@
+set term postscript enhanced color font ',20'
+set output 'i7-3615QM_gflops_double_update.eps'
+
+set title "i7-3615QM SeqBlk double"
+set logscale x
+set yrange [3:20]
+set xlabel "Mesh size"
+set ylabel "GFlops/s"
+
+set label "Intel declared: 18.4 GFlops/s" at 150,18.8
+set label "DGEMM: 3.83 GFlops/s" at 150,4.2
+set arrow 1 from 128,18.4 to 1024,18.4 nohead dt '.'
+set arrow 2 from 128,3.83 to 1024,3.83 nohead dt '.'
+plot 'CUDA_gflops_double_update.txt' using 1:4:xtic(1) w lp lw 2 ti 'SeqBlk GFlops/s'
diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt
index 82e3739f43b8a8c49f27fe321ce23cf35d0ec7ce..b92eb2e4f806886dee7fbccec0b20f75ae2c982e 100644
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -21,18 +21,22 @@ endif()
 
 find_package(OpenACC)
 if (OpenACC_CXX_FOUND)
+    set(OPENACC_LINK_LIBS ${LINK_LIBS})
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp)
+    endif()
     
     if (ENABLE_SINGLE)
         add_executable(fd_openacc_single fd_openacc.cpp)
         target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION)
         target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
-        target_link_libraries(fd_openacc_single ${LINK_LIBS} -lgomp)
+        target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS})
     endif()
 
     if (ENABLE_DOUBLE)
         add_executable(fd_openacc_double fd_openacc.cpp)
         target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
-        target_link_libraries(fd_openacc_double ${LINK_LIBS} -lgomp)
+        target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS})
     endif()
 endif()
 
@@ -87,10 +91,22 @@ if (ENABLE_CUDA)
     endif()
 endif()
 
-option(ENABLE_VECTORIZER_REMARKS "Enable Clang vectorizer remarks" ON)
+option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
 if (ENABLE_VECTORIZER_REMARKS)
     if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
     endif()
 endif()
 
diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp
index e0d50d513e0be19e1b81046613364b4a217b33b7..f2d3de2f9b99231bd2ae0e643aee1660194b1655 100644
--- a/kokkos-testing/fd_catalog/fd_main.cpp
+++ b/kokkos-testing/fd_catalog/fd_main.cpp
@@ -29,8 +29,9 @@ int main(int argc, char **argv)
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
     std::cout << "Denormals: FTZ and DAZ" << std::endl;
 #endif
+    
     _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
-
+    //_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_DENORM);
 
     /* Make header */
     ofs << "\"SIZE\"    \"Seq\"    \"SeqBlk\"    ";
@@ -51,9 +52,9 @@ int main(int argc, char **argv)
         wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
         ofs << sz << "    ";
 
-        wec.init();
-        time = solve_sequential(wec);
-        ofs << time << "    ";
+        //wec.init();
+        //time = solve_sequential(wec);
+        //ofs << time << "    ";
 
         wec.init();
         time = solve_sequential_blocked(wec);
@@ -67,9 +68,9 @@ int main(int argc, char **argv)
 
         for (size_t threads = 1; threads < maxthreads; threads *= 2)
         {
-            wec.init();
-            time = solve_multithread(wec, threads);
-            ofs << time << "    ";
+            //wec.init();
+            //time = solve_multithread(wec, threads);
+            //ofs << time << "    ";
         }
 
         ofs << std::endl;
diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
index 06f452c625a99d9577ac0ff5257b9d956f9615c5..73f3d75314de1220890fcf8148bab894a7dbd304 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -44,6 +44,8 @@ operator<<(std::ostream& os, const tile& tl)
 
 #define BLK_ROWS 56
 #define BLK_COLS 56
+//#define KERNEL_BLK_USE_MEMCPY
+#define KERNEL_BLK_USE_TEMP
 
 template<typename T>
 void
@@ -51,9 +53,6 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                fd_grid<T>& g_next,
                const wave_2D_params<T>& params)
 {
-    /* 1) memcpy is faster than for
-     * 2) if you put inner block processing, you kill perf
-     */
     int maxrow  = params.maxrow;
     int maxcol  = params.maxcol;
     T   dt      = params.dt;
@@ -71,10 +70,12 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     static const T w4 = -1.0/560.0;
     static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
 
+#ifdef KERNEL_BLK_USE_TEMP
     T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE];
+#endif
 
-    T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
-    T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
+    T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1));
+    T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1));
     T one_minus_adt = (1.0 - a*dt);
     T two_minus_adt = (2.0 - a*dt);
 
@@ -91,19 +92,34 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
         {
             /* Here all the dimensions are known at compile time, let the
              * compiler do its job. */
+#ifdef KERNEL_BLK_USE_TEMP
             for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
             {
+#ifdef KERNEL_BLK_USE_MEMCPY
                 int ofs_x = bj - WAVE_8_HALO_SIZE;
                 int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                 assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                 memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
+#else
+                #pragma ivdep
+                #pragma clang loop vectorize(assume_safety)
+                for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
+                {
+                    int ofs_x = bj + j - WAVE_8_HALO_SIZE;
+                    int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+                    s_curr[i][j] = g_curr(ofs_y, ofs_x);
+                }
+#endif /* KERNEL_BLK_USE_MEMCPY */
             }
-
+#endif /* KERNEL_BLK_USE_TEMP */
             for (size_t i = 0; i < BLK_ROWS; i++)
             {
+                #pragma ivdep
+                //#pragma clang loop vectorize(assume_safety)
                 for (size_t j = 0; j < BLK_COLS; j++)
                 {
                     T lapl = 0.0;
+#ifdef KERNEL_BLK_USE_TEMP
                     for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                         lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
             
@@ -111,8 +127,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                         lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
 
                     T val = lapl -
-                            one_minus_adt * g_prev(bi+i, bj+j) +
-                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+#else
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
+
+                    T val = lapl -
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * g_curr(bi+i, bj+j);
+#endif /* KERNEL_BLK_USE_TEMP */
                     
                     assert(j < BLK_COLS);
                     g_next(bi+i, bj+j) = val;
@@ -124,22 +151,37 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     /* Do upper blocks */
     {
         size_t bi = blocks_y*BLK_ROWS;
+
         for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
         {
-            for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
+#ifdef KERNEL_BLK_USE_TEMP
+            for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++)
             {
-                
+#ifdef KERNEL_BLK_USE_MEMCPY
                 int ofs_x = bj - WAVE_8_HALO_SIZE;
                 int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                 assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                 memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
+#else
+                #pragma clang loop vectorize(assume_safety)
+                for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
+                {
+                    int ofs_x = bj + j - WAVE_8_HALO_SIZE;
+                    int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+                    s_curr[i][j] = g_curr(ofs_y, ofs_x);
+                }
+#endif
             }
+#endif /* KERNEL_BLK_USE_TEMP */
 
             for (size_t i = 0; i < rem_y; i++)
             {
+                #pragma ivdep
+                //#pragma clang loop vectorize(assume_safety)
                 for (size_t j = 0; j < BLK_COLS; j++)
                 {
                     T lapl = 0.0;
+#ifdef KERNEL_BLK_USE_TEMP
                     for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                         lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
             
@@ -147,8 +189,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                         lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
 
                     T val = lapl -
-                            one_minus_adt * g_prev(bi+i, bj+j) +
-                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+#else
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
+
+                    T val = lapl -
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * g_curr(bi+i, bj+j);
+#endif /* KERNEL_BLK_USE_TEMP */
                     
                     assert(j < BLK_COLS);
                     g_next(bi+i, bj+j) = val;
@@ -162,29 +215,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
         size_t bi = blocks_y*BLK_ROWS;
         size_t bj = blocks_x*BLK_COLS;
 
-        for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
+#ifdef KERNEL_BLK_USE_TEMP
+        for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++)
         {
-            
+#ifdef KERNEL_BLK_USE_MEMCPY
             int ofs_x = bj - WAVE_8_HALO_SIZE;
             int ofs_y = bi + i - WAVE_8_HALO_SIZE;
             assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
-            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
+            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T));
+#else
+            #pragma clang loop vectorize(assume_safety)
+            for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++)
+            {
+                int ofs_x = bj + j - WAVE_8_HALO_SIZE;
+                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+                s_curr[i][j] = g_curr(ofs_y, ofs_x);
+            }
+#endif
         }
+#endif /* KERNEL_BLK_USE_TEMP */
 
         for (size_t i = 0; i < rem_y; i++)
         {
+            #pragma ivdep
+            //#pragma clang loop vectorize(assume_safety)
             for (size_t j = 0; j < rem_x; j++)
             {
                 T lapl = 0.0;
-                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
-        
-                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
+#ifdef KERNEL_BLK_USE_TEMP
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
 
-                T val = lapl -
+                    T val = lapl -
                         one_minus_adt * g_prev(bi+i, bj+j) +
                         two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+#else
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
+
+                    T val = lapl -
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * g_curr(bi+i, bj+j);
+#endif /* KERNEL_BLK_USE_TEMP */
                 
                 assert(j < BLK_COLS);
                 g_next(bi+i, bj+j) = val;
@@ -197,29 +275,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     {
         size_t bj = blocks_x*BLK_COLS;
 
+#ifdef KERNEL_BLK_USE_TEMP
         for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
         {
-            
+#ifdef KERNEL_BLK_USE_MEMCPY
             int ofs_x = bj - WAVE_8_HALO_SIZE;
             int ofs_y = bi + i - WAVE_8_HALO_SIZE;
             assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
-            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T));
+            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T));
+#else
+            #pragma clang loop vectorize(assume_safety)
+            for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++)
+            {      
+                int ofs_x = bj + j - WAVE_8_HALO_SIZE;
+                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
+                s_curr[i][j] = g_curr(ofs_y, ofs_x);
+            }
+#endif
         }
+#endif /* KERNEL_BLK_USE_TEMP */
 
         for (size_t i = 0; i < BLK_ROWS; i++)
         {
+            #pragma ivdep
+            //#pragma clang loop vectorize(assume_safety)
             for (size_t j = 0; j < rem_x; j++)
             {
                 T lapl = 0.0;
-                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
-        
-                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
+#ifdef KERNEL_BLK_USE_TEMP
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
 
-                T val = lapl -
+                    T val = lapl -
                         one_minus_adt * g_prev(bi+i, bj+j) +
                         two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
+#else
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
+            
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
+
+                    T val = lapl -
+                        one_minus_adt * g_prev(bi+i, bj+j) +
+                        two_minus_adt * g_curr(bi+i, bj+j);
+#endif /* KERNEL_BLK_USE_TEMP */
                 
                 assert(j < BLK_COLS);
                 g_next(bi+i, bj+j) = val;
@@ -265,8 +368,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     static const T w4 = -1.0/560.0;
     static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
 
-    T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
-    T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
+    T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1));
+    T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1));
     T one_minus_adt = (1.0 - a*dt);
     T two_minus_adt = (2.0 - a*dt);
 
@@ -281,7 +384,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
             for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                 lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j);
 
-            T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j);
+            T val = lapl - one_minus_adt * g_prev(i, j);
+            val += two_minus_adt * g_curr(i, j);
 
             if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
                 val = 0;
diff --git a/kokkos-testing/test_mandel.cpp b/kokkos-testing/test_mandel.cpp
index 6365842e07564d5f6f6eaf940039c1882efc8398..832b340a6b1825eea31513dbe043ddb94e492161 100644
--- a/kokkos-testing/test_mandel.cpp
+++ b/kokkos-testing/test_mandel.cpp
@@ -3,6 +3,9 @@
 #include <vector>
 #include <chrono>
 
+#include <pmmintrin.h>
+#include <xmmintrin.h>
+
 #define YMIN (-1.5)
 #define YMAX (1.5)
 #define XMIN (-2.5)
@@ -56,8 +59,6 @@ void gpu_mandel(T *mandel, int maxiter)
 		mandel[i*SIZE_X + j] = iter;
 }
 
-#endif /* __NVCC__  */
-
 template<typename T>
 void compute_gpu(std::vector<T>& mandel, int maxiter)
 {
@@ -65,10 +66,12 @@ void compute_gpu(std::vector<T>& mandel, int maxiter)
     dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS);
 }
 
+#endif /* __NVCC__  */
+
 int main(void)
 {
 	size_t maxiter = 1000;
-	using T = double;
+	using T = float;
 
 	std::vector<T> mandel(SIZE_X*SIZE_Y);
 
@@ -76,7 +79,9 @@ int main(void)
 
     size_t flops = 0;
 
-    #pragma omp parallel for
+    //_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+    //_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+
 	for (size_t i = 0; i < SIZE_Y; i++)
 	{
 		for (size_t j = 0; j < SIZE_X; j++)