From d14736e3b050d6f923124076710a78a3ca4868a5 Mon Sep 17 00:00:00 2001
From: Matteo Cicuttin <datafl4sh@toxicnet.eu>
Date: Tue, 31 Mar 2020 14:46:25 +0200
Subject: [PATCH] OpenACC *really* working now.

---
 kokkos-testing/fd_catalog/CMakeLists.txt |   8 +-
 kokkos-testing/fd_catalog/fd_openacc.cpp | 109 ++++++++++++-----------
 kokkos-testing/test_sumbw.cpp            |   2 +
 3 files changed, 66 insertions(+), 53 deletions(-)

diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt
index 8d3d2f2..ee0b1eb 100644
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -36,20 +36,22 @@ endif()
 find_package(OpenACC)
 if (OpenACC_CXX_FOUND)
     set(OPENACC_LINK_LIBS ${LINK_LIBS})
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if (COMPILER_IS_GNU)
         set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp)
     endif()
-    
+
     if (ENABLE_SINGLE)
         add_executable(fd_openacc_single fd_openacc.cpp)
         target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION)
         target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
+        target_link_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
         target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS})
     endif()
 
     if (ENABLE_DOUBLE)
         add_executable(fd_openacc_double fd_openacc.cpp)
         target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
+        target_link_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
         target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS})
     endif()
 endif()
@@ -79,7 +81,7 @@ endif()
 option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON)
 if (OPT_AGGRESSIVE_FP)
     add_definitions(-DDISALLOW_DENORMALS)
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if (COMPILER_IS_CLANG OR COMPILER_IS_GNU)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
     endif()
 endif()
diff --git a/kokkos-testing/fd_catalog/fd_openacc.cpp b/kokkos-testing/fd_catalog/fd_openacc.cpp
index 0d80c0c..d282a76 100644
--- a/kokkos-testing/fd_catalog/fd_openacc.cpp
+++ b/kokkos-testing/fd_catalog/fd_openacc.cpp
@@ -9,6 +9,19 @@
 
 /* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */
 
+#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) )
+
+/*
+#pragma acc routine
+template<typename T>
+void wave_kernel(const T * __restrict__ d_prev,
+                 const T * __restrict__ d_curr,
+                 T * __restrict__ d_next,
+                size_t maxrow, size_t maxcol, T c, T a, T dt)
+{
+}
+*/
+
 template<typename T>
 double solve_openacc(wave_equation_context<T>& wec)
 {
@@ -22,75 +35,71 @@ double solve_openacc(wave_equation_context<T>& wec)
     assert(maxcol > 1);
     assert(maxrow > 1);
 
-    double time = 0.0;
-
-#ifdef SAVE_ITERTIME
-    std::ofstream ofs;   
-    ofs.open("itertime-naive.txt");
-#endif /* SAVE_ITERTIME */
-
     /**** Initialize constants ****/
-    static const T w0 = -205.0/72.0;
-    static const T w1 = 8.0/5.0;
-    static const T w2 = -1.0/5.0;
-    static const T w3 = 8.0/315.0;
-    static const T w4 = -1.0/560.0;
-    static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
+    const T w0 = -205.0/72.0;
+    const T w1 = 8.0/5.0;
+    const T w2 = -1.0/5.0;
+    const T w3 = 8.0/315.0;
+    const T w4 = -1.0/560.0;
+    const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
 
-    size_t nelem    = wec.g_curr.size();
-	T *d_prev = acc_copyin(wec.g_prev.data(), nelem*sizeof(T));
-	T *d_curr = acc_copyin(wec.g_curr.data(), nelem*sizeof(T));
-	T *d_next = acc_copyin(wec.g_next.data(), nelem*sizeof(T));
 
-#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) )
+    size_t nelem    = wec.g_curr.size();
+    T * __restrict__ u_prev = (T *) acc_copyin((void*) wec.g_prev.data(), nelem*sizeof(T));
+    T * __restrict__ u_curr = (T *) acc_copyin((void*) wec.g_curr.data(), nelem*sizeof(T));
+    T * __restrict__ u_next = (T *) acc_copyin((void*) wec.g_next.data(), nelem*sizeof(T));
 
     auto start = std::chrono::high_resolution_clock::now();
 
-#pragma acc kernels
-    for (size_t iter = 0; iter < wec.maxiter; iter++)
+    #pragma acc data copyin(dt, a, c, maxcol, maxrow, w)
+    //#pragma acc data copy(u_prev[0:nelem], u_curr[0:nelem], u_next[0:nelem])
     {
+        
         T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
         T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
         T one_minus_adt = (1.0 - a*dt);
         T two_minus_adt = (2.0 - a*dt);
 
-        #pragma acc loop independent
-        for (size_t i = 0; i < maxrow; i++)
+        for (size_t iter = 0; iter < wec.maxiter; iter++)
         {
-            #pragma acc loop independent
-            for (size_t j = 0; j < maxcol; j++)
+            #pragma acc parallel loop tile(32,32) deviceptr(u_prev, u_curr, u_next)
+            for (size_t i = 0; i < maxrow; i++)
             {
-                T lapl = 0.0;
-                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i,j+k) ];
+                for (size_t j = 0; j < maxcol; j++)
+                {
+                    T lapl = 0.0;
+                    #pragma acc loop reduction(+:lapl)
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ];
+
+                    #pragma acc loop reduction(+:lapl)
+                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ];
+
+                    T val = lapl - 
+                        one_minus_adt * u_prev[ U_OFFSET(i,j) ] + 
+                        two_minus_adt * u_curr[ U_OFFSET(i,j) ];
+
+                    if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
+                        val = 0;
         
-                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i+k,j) ];
-
-                T val = lapl - 
-                        one_minus_adt * d_prev[ U_OFFSET(i,j) ] + 
-                        two_minus_adt * d_curr[ U_OFFSET(i,j) ];
-
-                if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
-                    val = 0;
-                
-                d_next[ U_OFFSET(i,j) ] = val;
+                    u_next[ U_OFFSET(i,j) ] = val;
+                }
             }
+
+            std::swap(u_prev, u_curr);
+            std::swap(u_curr, u_next);
         }
+	}
+
+    acc_copyout((void*) wec.g_prev.data(), nelem*sizeof(T));
+    acc_copyout((void*) wec.g_curr.data(), nelem*sizeof(T));
+    acc_copyout((void*) wec.g_next.data(), nelem*sizeof(T));
 
-		T *d_temp = d_prev;
-		d_prev = d_curr;
-		d_curr = d_next;
-		d_next = d_temp;
-    }
-	
-	acc_copyout(wec.g_prev.data(), nelem*sizeof(T));
-	acc_copyout(wec.g_curr.data(), nelem*sizeof(T));
-	acc_copyout(wec.g_next.data(), nelem*sizeof(T));
     auto stop = std::chrono::high_resolution_clock::now();
 
     std::chrono::duration<double, std::milli> ms = stop - start;
-    time += ms.count();
+    double time = ms.count();
     
     std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
     std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl;
@@ -105,7 +114,7 @@ double solve_openacc(wave_equation_context<T>& wec)
     
 
 #ifdef HAVE_SILO
-    visit_dump(wec.g_curr, "wave_openacc_lastiter.silo");
+    visit_dump(wec.g_next, "wave_openacc_lastiter.silo");
 #endif /* HAVE_SILO */
 
     return time/wec.maxiter;
@@ -123,7 +132,7 @@ int main(void)
 
     double time;
 
-	for (size_t sz = 256; sz <= 256; sz *= 2)
+	for (size_t sz = 128; sz <= 1024; sz *= 2)
     {
         std::cout << sz << std::endl;
         wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
diff --git a/kokkos-testing/test_sumbw.cpp b/kokkos-testing/test_sumbw.cpp
index d1271a7..d2a6b93 100644
--- a/kokkos-testing/test_sumbw.cpp
+++ b/kokkos-testing/test_sumbw.cpp
@@ -4,6 +4,8 @@
 #include <chrono>
 #include <numeric>
 
+#include <cstring>
+
 extern "C" {
 
 void sum_restrict(const double * __restrict__ prev,
-- 
GitLab