diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt
index b92eb2e4f806886dee7fbccec0b20f75ae2c982e..1927a55d9f6253569f711e3e1efb13d1eaa03755 100644
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -53,11 +53,44 @@ if (ENABLE_ITERTIME_OUTPUT)
     add_definitions(-DSAVE_ITERTIME)
 endif()
 
-option(ENABLE_DAZ_FTZ "Enable Denormals Are Zero and Flush To Zero flags" ON)
-if (ENABLE_DAZ_FTZ)
+######################################################################
+## Optimization stuff
+
+option(OPT_PREFER_512bit "Prefer 512 bit vectors with AVX512 (Clang > 10 & GCC)" OFF)
+if (OPT_PREFER_512bit)
+    # https://reviews.llvm.org/D67259
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mprefer-vector-width=512")
+endif()
+
+option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON)
+if (OPT_AGGRESSIVE_FP)
     add_definitions(-DDISALLOW_DENORMALS)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
+    endif()
 endif()
 
+option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
+if (ENABLE_VECTORIZER_REMARKS)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "-PGI")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minfo")
+    endif()
+endif()
+
+######################################################################
+
 option(ENABLE_KOKKOS "Enable Kokkos" OFF)
 if (ENABLE_KOKKOS)
     FetchContent_Declare(kokkos
@@ -91,24 +124,6 @@ if (ENABLE_CUDA)
     endif()
 endif()
 
-option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
-if (ENABLE_VECTORIZER_REMARKS)
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
-    endif()
-
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
-    endif()
-
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
-    endif()
-
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
-    endif()
-endif()
 
 set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -g -DNDEBUG")
diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp
index f2d3de2f9b99231bd2ae0e643aee1660194b1655..cbf749ab2255c19b5774a6afb6e9c01a8cad5db9 100644
--- a/kokkos-testing/fd_catalog/fd_main.cpp
+++ b/kokkos-testing/fd_catalog/fd_main.cpp
@@ -29,7 +29,7 @@ int main(int argc, char **argv)
     _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
     std::cout << "Denormals: FTZ and DAZ" << std::endl;
 #endif
-    
+
     _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
     //_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_DENORM);
 
@@ -52,9 +52,9 @@ int main(int argc, char **argv)
         wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
         ofs << sz << "    ";
 
-        //wec.init();
-        //time = solve_sequential(wec);
-        //ofs << time << "    ";
+        wec.init();
+        time = solve_sequential(wec);
+        ofs << time << "    ";
 
         wec.init();
         time = solve_sequential_blocked(wec);
@@ -68,9 +68,9 @@ int main(int argc, char **argv)
 
         for (size_t threads = 1; threads < maxthreads; threads *= 2)
         {
-            //wec.init();
-            //time = solve_multithread(wec, threads);
-            //ofs << time << "    ";
+            wec.init();
+            time = solve_multithread_nopool(wec, threads);
+            ofs << time << "    ";
         }
 
         ofs << std::endl;
diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
index 73f3d75314de1220890fcf8148bab894a7dbd304..31b620ff97ab7f96901832de15440395e1ec5f2a 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -44,7 +44,7 @@ operator<<(std::ostream& os, const tile& tl)
 
 #define BLK_ROWS 56
 #define BLK_COLS 56
-//#define KERNEL_BLK_USE_MEMCPY
+#define KERNEL_BLK_USE_MEMCPY
 #define KERNEL_BLK_USE_TEMP
 
 template<typename T>
@@ -101,7 +101,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                 assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                 memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
 #else
-                #pragma ivdep
+                //#pragma ivdep
                 #pragma clang loop vectorize(assume_safety)
                 for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
                 {
@@ -114,7 +114,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
 #endif /* KERNEL_BLK_USE_TEMP */
             for (size_t i = 0; i < BLK_ROWS; i++)
             {
-                #pragma ivdep
+                //#pragma ivdep
                 //#pragma clang loop vectorize(assume_safety)
                 for (size_t j = 0; j < BLK_COLS; j++)
                 {
@@ -337,6 +337,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
         g_next(maxrow-1, j) = 0;
     }
 
+    #pragma loop distribute(enable)
     for (size_t i = 0; i < maxrow; i++)
     {
         g_next(i, 0) = 0;
@@ -349,7 +350,7 @@ void
 wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                fd_grid<T>& g_next,
                const wave_2D_params<T>& params,
-               size_t thread_id = 0, size_t num_threads = 1)
+               size_t from = 0, size_t to = 0)
 {
     int maxrow  = params.maxrow;
     int maxcol  = params.maxcol;
@@ -357,6 +358,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     T   c       = params.velocity;
     T   a       = params.damping;
 
+    if (to == 0)
+        to = maxrow;
+
     assert(maxcol > 1);
     assert(maxrow > 1);
 
@@ -373,8 +377,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
     T one_minus_adt = (1.0 - a*dt);
     T two_minus_adt = (2.0 - a*dt);
 
-    for (size_t i = thread_id; i < maxrow; i += num_threads)
+    for (size_t i = from; i < to; i++)
     {
+        #pragma clang loop vectorize(enable)
         for (size_t j = 0; j < maxcol; j++)
         {
             T lapl = 0.0;
@@ -614,3 +619,94 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
     return time/wec.maxiter;
 }
 
+template<typename T>
+double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
+{
+    /* Simulation parameters */
+    wave_2D_params<T> params;
+    params.maxcol = wec.g_curr.domain_cols();
+    params.maxrow = wec.g_curr.domain_rows();
+    params.dt = wec.dt;
+    params.velocity = wec.velocity;
+    params.damping = wec.damping;
+
+
+    /* Multithreading stuff */
+    std::mutex                  cv_mtx;
+    std::condition_variable     prod_cv;
+    std::condition_variable     cons_cv;
+    std::vector<bool>           thread_done(nths);
+    bool                        iteration_finished = false;
+
+    auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
+#ifdef DISALLOW_DENORMALS
+        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#endif
+
+        size_t count        = params.maxrow / num_threads;
+        size_t remainder    = params.maxrow % num_threads;
+
+        size_t from = 0;
+        size_t to   = 0;
+
+        if (thread_id < remainder)
+        {
+            from    = thread_id * (count + 1);
+            to      = from + count;
+        }
+        else
+        {
+            from = thread_id * count + remainder;
+            to   = from + (count - 1);
+        }
+
+        wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, from, to);
+    };
+
+
+    double time = 0.0;
+
+    for (size_t iter = 0; iter < wec.maxiter; iter++)
+    {
+        auto start = std::chrono::high_resolution_clock::now();
+
+        std::vector<std::thread> threads(nths);
+        for (size_t i = 0; i < nths; i++)
+            threads[i] = std::thread(thread_lambda, i, nths);
+
+        for (auto& th : threads)
+            th.join();
+
+        auto stop = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double, std::milli> ms = stop - start;
+        time += ms.count();
+
+        std::swap(wec.g_prev, wec.g_curr);
+        std::swap(wec.g_curr, wec.g_next);
+
+#ifdef HAVE_SILO
+#ifdef SAVE_TIMESTEPS
+        if ( (iter%100) == 0 )
+        {
+            std::stringstream ss;
+            ss << "wave_mt_" << iter << ".silo";
+            visit_dump(wec.g_curr, ss.str());
+        }
+#endif /* SAVE_TIMESTEPS */
+#endif /* HAVE_SILO */
+    }
+    
+    std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
+    std::cout << time/wec.maxiter << "ms" << std::endl;
+    std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
+    std::cout << time << "ms" << std::endl;
+
+#ifdef HAVE_SILO
+    visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
+#endif /* HAVE_SILO */
+
+    return time/wec.maxiter;
+}
+
+
diff --git a/kokkos-testing/test_thread_overhead.cpp b/kokkos-testing/test_thread_overhead.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..291ed576071b28ff9a8b9c5adb157bfb5f35f991
--- /dev/null
+++ b/kokkos-testing/test_thread_overhead.cpp
@@ -0,0 +1,9 @@
+#include <thread>
+
+int main(int argc, char** argv)
+{
+  for (volatile int i = 0; i < 500000; i++)
+    std::thread([](){}).detach();
+  return 0;
+}
+