diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt
index df923f68de2f7e889442673acecf1f90dddc88e6..82e3739f43b8a8c49f27fe321ce23cf35d0ec7ce 100644
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -21,7 +21,19 @@ endif()
 
 find_package(OpenACC)
 if (OpenACC_CXX_FOUND)
-    add_definitions(-DHAVE_OPENACC)
+    
+    if (ENABLE_SINGLE)
+        add_executable(fd_openacc_single fd_openacc.cpp)
+        target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION)
+        target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
+        target_link_libraries(fd_openacc_single ${LINK_LIBS} -lgomp)
+    endif()
+
+    if (ENABLE_DOUBLE)
+        add_executable(fd_openacc_double fd_openacc.cpp)
+        target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
+        target_link_libraries(fd_openacc_double ${LINK_LIBS} -lgomp)
+    endif()
 endif()
 
 option(ENABLE_TIMESTEP_OUTPUT "Save timesteps (don't use during perf meas)" OFF)
diff --git a/kokkos-testing/fd_catalog/fd_dataio.hpp b/kokkos-testing/fd_catalog/fd_dataio.hpp
index 2fe9efc31ef86761252a78521b94754021a7b22c..289c90757f475c4691fbab5f96f9109a0745e473 100644
--- a/kokkos-testing/fd_catalog/fd_dataio.hpp
+++ b/kokkos-testing/fd_catalog/fd_dataio.hpp
@@ -20,7 +20,7 @@ visit_dump(const fd_grid<T>& g, const std::string& fn)
     static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float");
 
     DBfile *db = nullptr;
-    db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_HDF5);
+    db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_PDB);
     if (!db)
     {
         std::cout << "Cannot write simulation output" << std::endl;
diff --git a/kokkos-testing/fd_catalog/fd_openacc.cpp b/kokkos-testing/fd_catalog/fd_openacc.cpp
index 9a857a17916ac9cbf7231c0025d980095d2a52b8..ad4ffea78b2a8149b4be3a7445e7b5f83a0292e5 100644
--- a/kokkos-testing/fd_catalog/fd_openacc.cpp
+++ b/kokkos-testing/fd_catalog/fd_openacc.cpp
@@ -5,9 +5,123 @@
 
 #include "fd_wave_cpu.hpp"
 
+/* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */
+
 template<typename T>
-double solve_openacc(wave_equation_context<T>& wec, size_t nths)
+double solve_openacc(wave_equation_context<T>& wec)
 {
+    /* Simulation parameters */
+    int maxcol = wec.g_curr.domain_cols();
+    int maxrow = wec.g_curr.domain_rows();
+    T dt = wec.dt;
+    T c = wec.velocity;
+    T a = wec.damping;
+
+    assert(maxcol > 1);
+    assert(maxrow > 1);
+
+    double time = 0.0;
+
+#ifdef SAVE_ITERTIME
+    std::ofstream ofs;   
+    ofs.open("itertime-naive.txt");
+#endif /* SAVE_ITERTIME */
+
+    /**** Initialize constants ****/
+    static const T w0 = -205.0/72.0;
+    static const T w1 = 8.0/5.0;
+    static const T w2 = -1.0/5.0;
+    static const T w3 = 8.0/315.0;
+    static const T w4 = -1.0/560.0;
+    static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
+
+    T * __restrict__ u_prev       = wec.g_prev.data();
+    T * __restrict__ u_curr       = wec.g_curr.data();
+    T * __restrict__ u_next       = wec.g_next.data();
+    size_t nelem    = wec.g_curr.size();
+
+#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) )
+
+//#pragma acc data copy(u_prev[0:nelem])
+//#pragma acc data copy(u_curr[0:nelem])
+//#pragma acc data copy(u_next[0:nelem])
+    for (size_t iter = 0; iter < wec.maxiter; iter++)
+    {
+        auto start = std::chrono::high_resolution_clock::now();
+
+        T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
+        T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
+        T one_minus_adt = (1.0 - a*dt);
+        T two_minus_adt = (2.0 - a*dt);
+
+#pragma omp parallel for shared(maxrow, maxcol, u_prev, u_curr, u_next)
+#pragma acc kernels
+        #pragma acc loop independent
+        for (size_t i = 0; i < maxrow; i++)
+        {
+            #pragma acc loop independent
+            for (size_t j = 0; j < maxcol; j++)
+            {
+                T lapl = 0.0;
+                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ];
+        
+                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ];
+
+                T val = lapl - 
+                        one_minus_adt * u_prev[ U_OFFSET(i,j) ] + 
+                        two_minus_adt * u_curr[ U_OFFSET(i,j) ];
+
+                if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
+                    val = 0;
+                
+                u_next[ U_OFFSET(i,j) ] = val;
+            }
+        }
+
+        auto stop = std::chrono::high_resolution_clock::now();
+
+        std::chrono::duration<double, std::milli> ms = stop - start;
+        time += ms.count();
+        
+        std::swap(u_prev, u_curr);
+        std::swap(u_curr, u_next);
+
+#ifdef HAVE_SILO
+#ifdef SAVE_TIMESTEPS
+        if ( (iter%100) == 0 )
+        {
+            //#pragma acc update self(u_curr[0:nelem])
+            std::stringstream ss;
+            ss << "wave_openacc_" << iter << ".silo";
+            visit_dump(wec.g_curr, ss.str());
+        }
+#endif /* SAVE_TIMESTEPS */
+#endif /* HAVE_SILO */
+
+#ifdef SAVE_ITERTIME
+        ofs << i << " " << ms.count() << std::endl;
+#endif /* SAVE_ITERTIME */
+    }
+    
+    std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
+    std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl;
+    
+    double itertime = time/wec.maxiter;
+    double gflops_s = 58*(maxrow*maxcol)/(1e6*itertime);
+    std::cout << "[Wave][OpenACC] GFlops/s: " << gflops_s << std::endl;
+
+    size_t kernel_bytes = 3*sizeof(T)*(maxrow*maxcol);
+    double gbytes_s = kernel_bytes/(1e6*itertime);
+    std::cout << "[Wave][OpenACC] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
+    
+
+#ifdef HAVE_SILO
+    visit_dump(wec.g_curr, "wave_openacc_lastiter.silo");
+#endif /* HAVE_SILO */
+
+    return time/wec.maxiter;
 }
 
 int main(void)
@@ -20,8 +134,11 @@ int main(void)
     std::cout << "Precision: double" << std::endl;
 #endif
 
-	for (size_t sz = 128; sz <= 1024; sz *= 2)
+    double time;
+
+	for (size_t sz = 256; sz <= 256; sz *= 2)
     {
+        std::cout << sz << std::endl;
         wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
 
         wec.init();
@@ -29,4 +146,6 @@ int main(void)
     }
 
     return 0;
-}
\ No newline at end of file
+}
+
+
diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
index 152c2b19ccfc7cc03e3bfb33fbddfa8b2a5f7b88..06f452c625a99d9577ac0ff5257b9d956f9615c5 100644
--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -14,6 +14,8 @@
 #include <sstream>
 
 #include <string.h>
+#include <pmmintrin.h>
+#include <xmmintrin.h>
 
 #include "fd_grid.hpp"
 #include "fd_dataio.hpp"