diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp
index cbf749ab2255c19b5774a6afb6e9c01a8cad5db9..0f1a8c1a6f38952dcacb28be665f6e303553e5d2 100644
--- a/kokkos-testing/fd_catalog/fd_main.cpp
+++ b/kokkos-testing/fd_catalog/fd_main.cpp
@@ -65,11 +65,10 @@ int main(int argc, char **argv)
         time = solve_cuda(wec);
         ofs << time << "    ";
 #endif
-
         for (size_t threads = 1; threads < maxthreads; threads *= 2)
         {
             wec.init();
-            time = solve_multithread_nopool(wec, threads);
+            time = solve_multithread(wec, threads);
             ofs << time << "    ";
         }
 
diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
index 31b620ff97ab7f96901832de15440395e1ec5f2a..e82cebff0ea61d70072b74f672a99159c4325636 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -12,6 +12,7 @@
 #include <cmath>
 #include <algorithm>
 #include <sstream>
+#include <atomic>
 
 #include <string.h>
 #include <pmmintrin.h>
@@ -350,7 +351,7 @@ void
 wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                fd_grid<T>& g_next,
                const wave_2D_params<T>& params,
-               size_t from = 0, size_t to = 0)
+               size_t from = 0, size_t to = 1)
 {
     int maxrow  = params.maxrow;
     int maxcol  = params.maxcol;
@@ -358,12 +359,12 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     T   c       = params.velocity;
     T   a       = params.damping;
 
-    if (to == 0)
-        to = maxrow;
-
     assert(maxcol > 1);
     assert(maxrow > 1);
 
+    //if (to == 0)
+    //    to = maxrow-1;
+
     /**** Initialize constants ****/
     static const T w0 = -205.0/72.0;
     static const T w1 = 8.0/5.0;
@@ -377,7 +378,7 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     T one_minus_adt = (1.0 - a*dt);
     T two_minus_adt = (2.0 - a*dt);
 
-    for (size_t i = from; i < to; i++)
+    for (size_t i = from; i < maxrow; i+=to)
     {
         #pragma clang loop vectorize(enable)
         for (size_t j = 0; j < maxcol; j++)
@@ -509,6 +510,18 @@ double solve_sequential_blocked(wave_equation_context<T>& wec)
     return solve_sequential_aux<T,true>(wec);
 }
 
+class SpinLock { // Busy-waiting lock built on std::atomic_flag; exposes lock()/unlock() like a mutex.
+    std::atomic_flag locked = ATOMIC_FLAG_INIT ; // flag starts clear, i.e. the lock starts out free
+public:
+    void lock() { // Acquire: does not return until this caller is the one that set the flag.
+        while (locked.test_and_set(std::memory_order_acquire)) { ; } // true => flag was already set by another holder; retry without sleeping or yielding
+    }
+    void unlock() { // Release: clears the flag so a spinning lock() call can succeed.
+        locked.clear(std::memory_order_release); // release ordering pairs with the acquire in lock()
+    }
+};
+
+
 template<typename T>
 double solve_multithread(wave_equation_context<T>& wec, size_t nths)
 {
@@ -526,6 +539,7 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
     std::condition_variable     prod_cv;
     std::condition_variable     cons_cv;
     std::vector<bool>           thread_done(nths);
+    std::vector<double>         times(nths);
     bool                        iteration_finished = false;
 
     auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
@@ -546,12 +560,16 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
             }
 
             /* Do the timestep */
+            auto start = std::chrono::high_resolution_clock::now();
             wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads);
+            auto stop = std::chrono::high_resolution_clock::now();
+            std::chrono::duration<double, std::milli> ms = stop - start;
 
             /* Work for this thread finished, notify producer */
             std::unique_lock<std::mutex> lck(cv_mtx);
             prod_cv.notify_one();
             thread_done[thread_id] = true;
+            times[thread_id] += ms.count();
         }
     };
 
@@ -607,11 +625,23 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
     for (auto& th : threads)
         th.join();
     
+    double itertime = time/wec.maxiter;
+
     std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
-    std::cout << time/wec.maxiter << "ms" << std::endl;
+    std::cout << itertime << "ms" << std::endl;
     std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
     std::cout << time << "ms" << std::endl;
 
+    for (auto& t : times)
+    {
+        double t_itertime = t/wec.maxiter;
+        double t_overhead = (time - t)/wec.maxiter;
+        double t_overhead_percent = 100.0*t_overhead/itertime;
+        std::cout << "  Thread time: " << t_itertime;
+        std::cout << " overhead: " << t_overhead << " (";
+        std::cout << t_overhead_percent << "%)" << std::endl;
+    }
+
 #ifdef HAVE_SILO
     visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
 #endif /* HAVE_SILO */
@@ -639,6 +669,7 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
     bool                        iteration_finished = false;
 
     auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
+        /*
 #ifdef DISALLOW_DENORMALS
         _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
         _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
@@ -662,6 +693,7 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
         }
 
         wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, from, to);
+        */
     };
 
 
@@ -675,8 +707,8 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
         for (size_t i = 0; i < nths; i++)
             threads[i] = std::thread(thread_lambda, i, nths);
 
-        for (auto& th : threads)
-            th.join();
+        for (size_t i = 0; i < nths; i++)
+            threads[i].join();
 
         auto stop = std::chrono::high_resolution_clock::now();
         std::chrono::duration<double, std::milli> ms = stop - start;
diff --git a/kokkos-testing/test_sumbw.cpp b/kokkos-testing/test_sumbw.cpp
index 4ee808e32961ed9631dc5fec2d1ae1c969106406..d1271a71a473c2171b16252a575cf28dd4e8b293 100644
--- a/kokkos-testing/test_sumbw.cpp
+++ b/kokkos-testing/test_sumbw.cpp
@@ -54,8 +54,8 @@ int main(void)
         for (size_t iter = 0; iter < maxiter; iter++)
         {
             auto start = std::chrono::high_resolution_clock::now();
-            sum_restrict(prev.data(), curr.data(), next.data(), sz*sz);
-            //memcpy(next.data(), curr.data(), sz*sz*sizeof(T));
+            //sum_restrict(prev.data(), curr.data(), next.data(), sz*sz);
+            memcpy(next.data(), curr.data(), sz*sz*sizeof(T));
             std::swap(prev, curr);
             std::swap(curr, next);
             auto stop = std::chrono::high_resolution_clock::now();
@@ -65,7 +65,7 @@ int main(void)
         }
 
         auto time = std::accumulate(itertime.begin(), itertime.end(), 0.0) / maxiter;
-        std::cout << "Sum bandwidth: " <<  3*sizeof(T)*sz*sz/(1e6*time);
+        std::cout << "Sum bandwidth: " <<  2*sizeof(T)*sz*sz/(1e6*time);
         std::cout << " GB/s"  << std::endl;
 
         itertimes.push_back( std::move(itertime) );