Skip to content
Snippets Groups Projects
Commit d965a1d0 authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

Updated experimental data.

parent f2b143db
No related branches found
No related tags found
No related merge requests found
# Gnuplot script: per-iteration timing comparison for the 2D damped
# wave equation, double precision. Draws grouped histograms (one group
# per mesh size) from 'AllTimes_double_update.txt'; the header row of
# that file ("Seq", "SeqBlk", "1T", "2T", "4T") supplies the key
# labels via 'ti col'.
set term postscript enhanced color
set output 'AllTimes_double_update.eps'
set key left top
set title '2D damped wave equation - double'
set xlabel 'Mesh size'
set ylabel 'Iteration time [ms]'
# Histogram bars with solid fill and dark border; log-scaled y axis so
# the wide spread of timings (~0.07 ms to ~14 ms) stays readable.
set style data histogram
set style fill solid border -1
set logscale y
set grid ytics mytics
# Red reference segment spanning the four histogram clusters (x = 0..3);
# presumably marks an expected scaling trend — TODO confirm intent.
set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red'
# Column 1 = mesh size (used as x tick labels), columns 2-6 = the five
# timing series; '' reuses the previously named data file.
plot 'AllTimes_double_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \
'' u 4 ti col, '' u 5 ti col, '' u 6 ti col
"SIZE" "Seq" "SeqBlk" "1T" "2T" "4T"
128 0.215397 0.0721867 0.231328 0.146418 0.150125
256 0.884406 0.284522 0.996983 0.519418 0.420706
512 3.49626 1.34328 3.55165 1.84903 1.3564
1024 13.9678 5.97425 14.4531 7.56775 5.25945
# Gnuplot script: per-iteration timing comparison for the 2D damped
# wave equation, single precision. Same layout as the double-precision
# plot: grouped histograms per mesh size, series labels taken from the
# header row of 'AllTimes_float_update.txt' via 'ti col'.
set term postscript enhanced color
set output 'AllTimes_float_update.eps'
set key left top
set title '2D damped wave equation - float'
set xlabel 'Mesh size'
set ylabel 'Iteration time [ms]'
# Histogram bars with solid fill and dark border; log-scaled y axis so
# timings spanning roughly three decades stay readable.
set style data histogram
set style fill solid border -1
set logscale y
set grid ytics mytics
# Red reference segment spanning the four histogram clusters (x = 0..3);
# presumably marks an expected scaling trend — TODO confirm intent.
set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red'
# Column 1 = mesh size (used as x tick labels), columns 2-6 = the five
# timing series; '' reuses the previously named data file.
plot 'AllTimes_float_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \
'' u 4 ti col, '' u 5 ti col, '' u 6 ti col
"SIZE" "Seq" "SeqBlk" "1T" "2T" "4T"
128 0.285575 0.0328261 0.266828 0.159186 0.1409
256 1.15701 0.13435 1.07333 0.531615 0.387834
512 4.65854 0.57531 4.21764 2.03206 1.42824
1024 13.9774 3.15193 16.5969 8.72612 5.83051
"Size" "GT650M" "GTX760" "i7-3615QM"
128 13.062 45.9258 13.1641
256 16.8322 62.7176 13.3595
512 17.469 67.3603 11.3188
1024 17.0132 68.6386 10.1799
# Gnuplot script: measured GFlops/s of the blocked sequential kernel
# ("SeqBlk", double precision) on an i7-3615QM as a function of mesh
# size, plotted as a line-with-points series.
set term postscript enhanced color font ',20'
set output 'i7-3615QM_gflops_double_update.eps'
set title "i7-3615QM SeqBlk double"
set logscale x
set yrange [3:20]
set xlabel "Mesh size"
set ylabel "GFlops/s"
# Two dotted horizontal reference lines with labels placed just above
# them: the vendor-declared peak (18.4 GFlops/s) and a DGEMM baseline
# (3.83 GFlops/s).
set label "Intel declared: 18.4 GFlops/s" at 150,18.8
set label "DGEMM: 3.83 GFlops/s" at 150,4.2
set arrow 1 from 128,18.4 to 1024,18.4 nohead dt '.'
set arrow 2 from 128,3.83 to 1024,3.83 nohead dt '.'
# Column 1 = mesh size (also the x tick label), column 4 = GFlops/s.
# NOTE(review): the data file is named 'CUDA_gflops_double_update.txt'
# but the plotted series is the CPU SeqBlk kernel — verify this is the
# intended source file.
plot 'CUDA_gflops_double_update.txt' using 1:4:xtic(1) w lp lw 2 ti 'SeqBlk GFlops/s'
......@@ -21,18 +21,22 @@ endif()
find_package(OpenACC)
if (OpenACC_CXX_FOUND)
set(OPENACC_LINK_LIBS ${LINK_LIBS})
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp)
endif()
if (ENABLE_SINGLE)
add_executable(fd_openacc_single fd_openacc.cpp)
target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION)
target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
target_link_libraries(fd_openacc_single ${LINK_LIBS} -lgomp)
target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS})
endif()
if (ENABLE_DOUBLE)
add_executable(fd_openacc_double fd_openacc.cpp)
target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
target_link_libraries(fd_openacc_double ${LINK_LIBS} -lgomp)
target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS})
endif()
endif()
......@@ -87,10 +91,22 @@ if (ENABLE_CUDA)
endif()
endif()
option(ENABLE_VECTORIZER_REMARKS "Enable Clang vectorizer remarks" ON)
option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
if (ENABLE_VECTORIZER_REMARKS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
endif()
endif()
......
......@@ -29,8 +29,9 @@ int main(int argc, char **argv)
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
std::cout << "Denormals: FTZ and DAZ" << std::endl;
#endif
_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
//_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_DENORM);
/* Make header */
ofs << "\"SIZE\" \"Seq\" \"SeqBlk\" ";
......@@ -51,9 +52,9 @@ int main(int argc, char **argv)
wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
ofs << sz << " ";
wec.init();
time = solve_sequential(wec);
ofs << time << " ";
//wec.init();
//time = solve_sequential(wec);
//ofs << time << " ";
wec.init();
time = solve_sequential_blocked(wec);
......@@ -67,9 +68,9 @@ int main(int argc, char **argv)
for (size_t threads = 1; threads < maxthreads; threads *= 2)
{
wec.init();
time = solve_multithread(wec, threads);
ofs << time << " ";
//wec.init();
//time = solve_multithread(wec, threads);
//ofs << time << " ";
}
ofs << std::endl;
......
......@@ -44,6 +44,8 @@ operator<<(std::ostream& os, const tile& tl)
#define BLK_ROWS 56
#define BLK_COLS 56
//#define KERNEL_BLK_USE_MEMCPY
#define KERNEL_BLK_USE_TEMP
template<typename T>
void
......@@ -51,9 +53,6 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params)
{
/* 1) memcpy is faster than for
* 2) if you put inner block processing, you kill perf
*/
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
......@@ -71,10 +70,12 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
#ifdef KERNEL_BLK_USE_TEMP
T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE];
#endif
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1));
T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1));
T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt);
......@@ -91,19 +92,34 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
{
/* Here all the dimensions are known at compile time, let the
* compiler do its job. */
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma ivdep
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif /* KERNEL_BLK_USE_MEMCPY */
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < BLK_ROWS; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS; j++)
{
T lapl = 0.0;
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
......@@ -111,8 +127,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -124,22 +151,37 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
/* Do upper blocks */
{
size_t bi = blocks_y*BLK_ROWS;
for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
{
for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < rem_y; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS; j++)
{
T lapl = 0.0;
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
......@@ -147,8 +189,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -162,29 +215,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
size_t bi = blocks_y*BLK_ROWS;
size_t bj = blocks_x*BLK_COLS;
for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < rem_y; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -197,29 +275,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
{
size_t bj = blocks_x*BLK_COLS;
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T));
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < BLK_ROWS; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -265,8 +368,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1));
T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1));
T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt);
......@@ -281,7 +384,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j);
T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j);
T val = lapl - one_minus_adt * g_prev(i, j);
val += two_minus_adt * g_curr(i, j);
if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
val = 0;
......
......@@ -3,6 +3,9 @@
#include <vector>
#include <chrono>
#include <pmmintrin.h>
#include <xmmintrin.h>
#define YMIN (-1.5)
#define YMAX (1.5)
#define XMIN (-2.5)
......@@ -56,8 +59,6 @@ void gpu_mandel(T *mandel, int maxiter)
mandel[i*SIZE_X + j] = iter;
}
#endif /* __NVCC__ */
template<typename T>
void compute_gpu(std::vector<T>& mandel, int maxiter)
{
......@@ -65,10 +66,12 @@ void compute_gpu(std::vector<T>& mandel, int maxiter)
dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS);
}
#endif /* __NVCC__ */
int main(void)
{
size_t maxiter = 1000;
using T = double;
using T = float;
std::vector<T> mandel(SIZE_X*SIZE_Y);
......@@ -76,7 +79,9 @@ int main(void)
size_t flops = 0;
#pragma omp parallel for
//_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
//_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
for (size_t i = 0; i < SIZE_Y; i++)
{
for (size_t j = 0; j < SIZE_X; j++)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment