diff --git a/kokkos-testing/experimental_data/makefigs.sh b/kokkos-testing/experimental_data/makefigs.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e236c4703e0ecd74c45de6060a77604874ba4429
--- /dev/null
+++ b/kokkos-testing/experimental_data/makefigs.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+#
+# Regenerate the figures in this directory: run every gnuplot script,
+# convert the resulting EPS files to PDF, crop the PDFs, and finally
+# rename each cropped copy over its uncropped original.
+#
+# Expansions are quoted so file names containing spaces or glob
+# characters are passed through intact. An unmatched glob is left as the
+# literal pattern by sh, so each loop skips names that do not exist.
+
+for file in *.plot; do
+    [ -e "$file" ] || continue
+    gnuplot "$file"
+done
+
+for file in *.eps; do
+    [ -e "$file" ] || continue
+    ps2pdf "$file"
+done
+
+for file in *.pdf; do
+    [ -e "$file" ] || continue
+    pdfcrop "$file"
+done
+
+# Strip only the trailing "-crop.pdf" so the cropped file replaces the
+# original, even when "-crop" also appears earlier in the name.
+for file in *-crop.pdf; do
+    [ -e "$file" ] || continue
+    mv "$file" "${file%-crop.pdf}.pdf"
+done
diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt
index 2f765c79618dd1674de131666fb6a987899f612a..742d534bc875cb4f0bfbebe74189bf289cb8901b 100644
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -1,31 +1,71 @@
-cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+# FetchContent_MakeAvailable(), used for Kokkos below, needs CMake 3.14.
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
 project(fd_catalog)
 include(CheckLanguage)
 include(FetchContent)
 
+# Make the bundled find modules (cmake/FindSILO.cmake) visible to find_package().
+set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
+# LINK_LIBS collects the libraries that every executable links against.
+set(LINK_LIBS ${LINK_LIBS} Threads::Threads)
+
+# SILO is optional; when present the sources are built with HAVE_SILO.
+find_package(SILO)
+if (SILO_FOUND)
+    add_definitions(-DHAVE_SILO)
+    include_directories("${SILO_INCLUDE_DIR}")
+    set(LINK_LIBS ${LINK_LIBS} ${SILO_LIBRARY})
+endif()
+
+option(ENABLE_TIMESTEP_OUTPUT "Save timesteps (don't use during perf meas)" OFF)
+if (ENABLE_TIMESTEP_OUTPUT)
+    if (NOT SILO_FOUND)
+        message(FATAL_ERROR "You need SILO to output iteration data")
+    endif()
+    add_definitions(-DSAVE_TIMESTEPS)
+endif()
+
+option(ENABLE_ITERTIME_OUTPUT "Save iteration times (don't use during perf meas)" OFF)
+if (ENABLE_ITERTIME_OUTPUT)
+    add_definitions(-DSAVE_ITERTIME)
+endif()
+
+# Declared before the Kokkos section because that section tests both
+# options; an option() that appears later in the file is still undefined
+# here on a first configure, so the Kokkos targets would be skipped.
+option(ENABLE_SINGLE "Enable single precision build" ON)
+option(ENABLE_DOUBLE "Enable double precision build" ON)
 
-option(ENABLE_KOKKOS "Enable Kokkos" ON)
+option(ENABLE_KOKKOS "Enable Kokkos" OFF)
 if (ENABLE_KOKKOS)
     FetchContent_Declare(kokkos
         GIT_REPOSITORY https://github.com/kokkos/kokkos.git
     )
     FetchContent_MakeAvailable(kokkos)
 
-    add_executable(finite_difference_kokkos finite_difference_kokkos.cpp)
-    target_link_libraries(finite_difference_kokkos Kokkos::kokkos siloh5)
-    #add_definitions(-DHAVE_KOKKOS)
-    #set(HAVE_KOKKOS TRUE)
+    if (ENABLE_SINGLE)
+        add_executable(fd_kokkos_single fd_kokkos.cpp)
+        target_compile_definitions(fd_kokkos_single PRIVATE SINGLE_PRECISION)
+        target_link_libraries(fd_kokkos_single PRIVATE ${LINK_LIBS} Kokkos::kokkos)
+    endif()
+
+    if (ENABLE_DOUBLE)
+        add_executable(fd_kokkos_double fd_kokkos.cpp)
+        target_link_libraries(fd_kokkos_double PRIVATE ${LINK_LIBS} Kokkos::kokkos)
+    endif()
+
+    set(HAVE_KOKKOS TRUE)
 endif()
 
 option(ENABLE_CUDA "Enable CUDA if present" ON)
 if (ENABLE_CUDA)
     check_language(CUDA)
-
     if (CMAKE_CUDA_COMPILER)
         enable_language(CUDA)
         set(CMAKE_CUDA_STANDARD 14)
@@ -35,41 +75,51 @@ if (ENABLE_CUDA)
     endif()
 endif()
 
+# -Rpass=loop-vectorize makes Clang report the loops it vectorizes.
+option(ENABLE_VECTORIZER_REMARKS "Enable Clang vectorizer remarks" ON)
+if (ENABLE_VECTORIZER_REMARKS)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize")
+    endif()
+endif()
+
 set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -g -DNDEBUG")
 set(CMAKE_CXX_FLAGS_RELEASEASSERT "-O3 -march=native -g -fpermissive")
 
-option(A_SAVE_TS "Save timesteps" OFF)
-if (A_SAVE_TS)
-    add_definitions(-DHAVE_SILO)
-    add_definitions(-DSAVE_TIMESTEPS)
-endif()
+# setup_fd_catalog_target(<FD_TGT_NAME> <SINGLE_PRECISION>)
+#
+# Creates the executable FD_TGT_NAME from fd_main.cpp, adding
+# fd_wave_cuda.cu when HAVE_CUDA is set, and links it against LINK_LIBS.
+# When SINGLE_PRECISION is true the target is compiled with
+# -DSINGLE_PRECISION.
+#
+# Written as a function, not a macro: macro arguments are not variables,
+# so if(SINGLE_PRECISION) inside a macro never sees the value passed in.
+function(setup_fd_catalog_target FD_TGT_NAME SINGLE_PRECISION)
+    set(FD_SOURCES "fd_main.cpp")
 
-option(ENABLE_SINGLE "Enable single precision build" ON)
-if (ENABLE_SINGLE)
     if (HAVE_CUDA)
-        add_executable(fd_catalog_single fd_main.cpp fd_wave_cuda.cu)
-        if(APPLE)
-            set_property(TARGET fd_catalog_single PROPERTY BUILD_RPATH /usr/local/cuda/lib)
-        endif()
-    else()
-        add_executable(fd_catalog_single fd_main.cpp)
+        set(FD_SOURCES ${FD_SOURCES} fd_wave_cuda.cu)
+    endif()
+
+    add_executable(${FD_TGT_NAME} ${FD_SOURCES})
+
+    if (HAVE_CUDA AND APPLE)
+        set_property(TARGET ${FD_TGT_NAME} PROPERTY BUILD_RPATH /usr/local/cuda/lib)
+    endif()
+
+    if (SINGLE_PRECISION)
+        target_compile_definitions(${FD_TGT_NAME} PRIVATE SINGLE_PRECISION)
     endif()
-    target_compile_definitions(fd_catalog_single PUBLIC -DSINGLE_PRECISION)
-    target_link_libraries(fd_catalog_single siloh5)
-    target_link_libraries(fd_catalog_single Threads::Threads)
+
+    target_link_libraries(${FD_TGT_NAME} PRIVATE ${LINK_LIBS})
+endfunction()
+
+if (ENABLE_SINGLE)
+    setup_fd_catalog_target("fd_catalog_single" TRUE)
 endif()
 
-option(ENABLE_DOUBLE "Enable double precision build" ON)
 if (ENABLE_DOUBLE)
-    if (HAVE_CUDA)
-        add_executable(fd_catalog_double fd_main.cpp fd_wave_cuda.cu)
-        if(APPLE)
-            set_property(TARGET fd_catalog_double PROPERTY BUILD_RPATH /usr/local/cuda/lib)
-        endif()
-    else()
-        add_executable(fd_catalog_double fd_main.cpp)
-    endif()
-    target_link_libraries(fd_catalog_double siloh5)
-    target_link_libraries(fd_catalog_double Threads::Threads)
+    setup_fd_catalog_target("fd_catalog_double" FALSE)
 endif()
diff --git a/kokkos-testing/fd_catalog/cmake/FindSILO.cmake b/kokkos-testing/fd_catalog/cmake/FindSILO.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..c1144bf762fcd39b2eac0e808576065b3dc98261
--- /dev/null
+++ b/kokkos-testing/fd_catalog/cmake/FindSILO.cmake
@@ -0,0 +1,27 @@
+# FindSILO.cmake
+#
+# Locates the SILO headers and library.
+#
+# Result variables:
+#   SILO_FOUND        - set by find_package_handle_standard_args when both
+#                       cache entries below were resolved
+#   SILO_INCLUDE_DIR  - directory that contains silo.h
+#   SILO_LIBRARY      - full path of the silo (or siloh5) library
+
+include(FindPackageHandleStandardArgs)
+
+find_library(SILO_LIBRARY
+    NAMES silo siloh5
+    PATHS /usr/lib /usr/local/lib
+)
+
+find_path(SILO_INCLUDE_DIR
+    NAMES silo.h
+    PATHS /usr/include /usr/local/include
+)
+
+find_package_handle_standard_args(SILO
+    DEFAULT_MSG
+    SILO_LIBRARY
+    SILO_INCLUDE_DIR
+)
diff --git a/kokkos-testing/fd_catalog/finite_difference_kokkos.cpp b/kokkos-testing/fd_catalog/fd_kokkos.cpp
similarity index 100%
rename from kokkos-testing/fd_catalog/finite_difference_kokkos.cpp
rename to kokkos-testing/fd_catalog/fd_kokkos.cpp
diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp
index 458d937dc852cb1ba2418a821ad57116df1a07f5..4f104c3c0e97d64a2560789a5d137643646eb6aa 100644
--- a/kokkos-testing/fd_catalog/fd_main.cpp
+++ b/kokkos-testing/fd_catalog/fd_main.cpp
@@ -12,12 +12,12 @@ int main(void)
 {
 #ifdef SINGLE_PRECISION
     using T = float;
+    std::ofstream ofs("timings-float.txt");
 #else
     using T = double;
+    std::ofstream ofs("timings-double.txt");
 #endif
 
-    std::ofstream ofs("timings.txt");
-
     /* Make header */
     ofs << "\"SIZE\"    \"Seq\"    \"SeqBlk\"    ";
 
@@ -32,9 +32,9 @@ int main(void)
 
     double time;
 
-    for (size_t sz = 128; sz <= 1024; sz *= 2)
+    for (size_t sz = 256; sz <= 256; sz *= 2)
     {
-        wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
+        wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.001, 5000);
         ofs << sz << "    ";
 
         wec.init();
@@ -51,8 +51,6 @@ int main(void)
         ofs << time << "    ";
 #endif
 
-        auto maxthreads = std::thread::hardware_concurrency();
-
         for (size_t threads = 1; threads < maxthreads; threads *= 2)
         {
             wec.init();
diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
index 63e60e14184b974bcffc51fb134b1fda76928b16..96e11c13ec853aab7098e2ebd571e6838982895d 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -309,14 +309,14 @@ solve_sequential_aux(wave_equation_context<T>& wec)
 
     double time = 0.0;
 
-    std::ofstream ofs;
-
-    /*
+#ifdef SAVE_ITERTIME
+    std::ofstream ofs;   
     if (blocked)
         ofs.open("itertime-blocked.txt");
     else
         ofs.open("itertime-naive.txt");
-    */
+#endif /* SAVE_ITERTIME */
+
     for (size_t i = 0; i < wec.maxiter; i++)
     {
         auto start = std::chrono::high_resolution_clock::now();
@@ -339,13 +339,18 @@ solve_sequential_aux(wave_equation_context<T>& wec)
         if ( (i%100) == 0 )
         {
             std::stringstream ss;
-            ss << "wave_seq_" << i << ".silo";
+            if (blocked)
+                ss << "wave_seqblk_" << i << ".silo";
+            else
+                ss << "wave_seq_" << i << ".silo";
             visit_dump(wec.g_curr, ss.str());
         }
 #endif /* SAVE_TIMESTEPS */
 #endif /* HAVE_SILO */
 
-        //ofs << i << " " << ms.count() << std::endl;
+#ifdef SAVE_ITERTIME
+        ofs << i << " " << ms.count() << std::endl;
+#endif /* SAVE_ITERTIME */
     }
     
     if (blocked)
@@ -370,8 +375,12 @@ solve_sequential_aux(wave_equation_context<T>& wec)
         double gbytes_s = kernel_bytes/(1e6*itertime);
         std::cout << "[Wave][Sequential] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
     }
+
 #ifdef HAVE_SILO
-    visit_dump(wec.g_curr, "wave_sequential_lastiter.silo");
+    if (blocked)
+        visit_dump(wec.g_curr, "wave_seqblk_lastiter.silo");
+    else
+        visit_dump(wec.g_curr, "wave_seq_lastiter.silo");
 #endif /* HAVE_SILO */
 
     return time/wec.maxiter;
@@ -489,7 +498,7 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
     std::cout << time << "ms" << std::endl;
 
 #ifdef HAVE_SILO
-    visit_dump(wec.g_curr, "wave_tiled_lastiter.silo");
+    visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
 #endif /* HAVE_SILO */
 
     return time/wec.maxiter;
diff --git a/kokkos-testing/fd_catalog/fd_wave_cuda.cu b/kokkos-testing/fd_catalog/fd_wave_cuda.cu
index 065444a5b15788f81c3e2878cb0b78b14e9a440a..cb03028e44975510a46c5a8e8f2e3d563e989e5c 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cuda.cu
+++ b/kokkos-testing/fd_catalog/fd_wave_cuda.cu
@@ -232,7 +232,8 @@ double solve_cuda_aux(wave_equation_context<T>& wec)
     std::cout << "[Wave][Cuda] Wall Time: " << milliseconds << "ms" << std::endl;
     
     double itertime = milliseconds/wec.maxiter;
-    double gflops_s = 70*(params.maxrow*params.maxcol)/(1e6*itertime);
+
+    double gflops_s = 70.0*(params.maxrow*params.maxcol)/(1e6*itertime);
     std::cout << "[Wave][Cuda] GFlops/s: " << gflops_s << std::endl;
 
     size_t kb1 = (2*WAVE_8_HALO_SIZE+WAVE_8_KER_ROWS)*(2*WAVE_8_HALO_SIZE+WAVE_8_KER_COLS);