OpenACC *really* working now.

OpenACC really working now.
d14736e3 · Matteo Cicuttin · 935ec216 · d14736e3 · d14736e3 · d14736e3
Commit d14736e3 authored 5 years ago by Matteo Cicuttin
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -36,7 +36,7 @@ endif()
 find_package(OpenACC)
 if (OpenACC_CXX_FOUND)
    set(OPENACC_LINK_LIBS ${LINK_LIBS})
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if (COMPILER_IS_GNU)
        set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp)
    endif()
@@ -44,12 +44,14 @@ if (OpenACC_CXX_FOUND)
        add_executable(fd_openacc_single fd_openacc.cpp)
        target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION)
        target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
+        target_link_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
        target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS})
    endif()
    if (ENABLE_DOUBLE)
        add_executable(fd_openacc_double fd_openacc.cpp)
        target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
+        target_link_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
        target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS})
    endif()
 endif()
@@ -79,7 +81,7 @@ endif()
 option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON)
 if (OPT_AGGRESSIVE_FP)
    add_definitions(-DDISALLOW_DENORMALS)
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    if (COMPILER_IS_CLANG OR COMPILER_IS_GNU)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
    endif()
 endif()

--- a/kokkos-testing/fd_catalog/fd_openacc.cpp
+++ b/kokkos-testing/fd_catalog/fd_openacc.cpp
@@ -9,6 +9,19 @@
 /* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */
+#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) )
+/*
+#pragma acc routine
+template<typename T>
+void wave_kernel(const T * __restrict__ d_prev,
+                 const T * __restrict__ d_curr,
+                 T * __restrict__ d_next,
+                size_t maxrow, size_t maxcol, T c, T a, T dt)
+{
+}
+*/
 template<typename T>
 double solve_openacc(wave_equation_context<T>& wec)
 {
@@ -22,75 +35,71 @@ double solve_openacc(wave_equation_context<T>& wec)
    assert(maxcol > 1);
    assert(maxrow > 1);
-    double time = 0.0;
-#ifdef SAVE_ITERTIME
-    std::ofstream ofs;   
-    ofs.open("itertime-naive.txt");
-#endif /* SAVE_ITERTIME */
    /**** Initialize constants ****/
-    static const T w0 = -205.0/72.0;
+    const T w0 = -205.0/72.0;
-    static const T w1 = 8.0/5.0;
+    const T w1 = 8.0/5.0;
-    static const T w2 = -1.0/5.0;
+    const T w2 = -1.0/5.0;
-    static const T w3 = 8.0/315.0;
+    const T w3 = 8.0/315.0;
-    static const T w4 = -1.0/560.0;
+    const T w4 = -1.0/560.0;
-    static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
+    const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
-    size_t nelem    = wec.g_curr.size();
-	T *d_prev = acc_copyin(wec.g_prev.data(), nelem*sizeof(T));
-	T *d_curr = acc_copyin(wec.g_curr.data(), nelem*sizeof(T));
-	T *d_next = acc_copyin(wec.g_next.data(), nelem*sizeof(T));
-#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) )
+    size_t nelem    = wec.g_curr.size();
+    T * __restrict__ u_prev = (T *) acc_copyin((void*) wec.g_prev.data(), nelem*sizeof(T));
+    T * __restrict__ u_curr = (T *) acc_copyin((void*) wec.g_curr.data(), nelem*sizeof(T));
+    T * __restrict__ u_next = (T *) acc_copyin((void*) wec.g_next.data(), nelem*sizeof(T));
    auto start = std::chrono::high_resolution_clock::now();
-#pragma acc kernels
+    #pragma acc data copyin(dt, a, c, maxcol, maxrow, w)
-    for (size_t iter = 0; iter < wec.maxiter; iter++)
+    //#pragma acc data copy(u_prev[0:nelem], u_curr[0:nelem], u_next[0:nelem])
    {
        T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
        T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
        T one_minus_adt = (1.0 - a*dt);
        T two_minus_adt = (2.0 - a*dt);
-        #pragma acc loop independent
+        for (size_t iter = 0; iter < wec.maxiter; iter++)
+        {
+            #pragma acc parallel loop tile(32,32) deviceptr(u_prev, u_curr, u_next)
            for (size_t i = 0; i < maxrow; i++)
            {
-            #pragma acc loop independent
                for (size_t j = 0; j < maxcol; j++)
                {
                    T lapl = 0.0;
+                    #pragma acc loop reduction(+:lapl)
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i,j+k) ];
+                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ];
+                    #pragma acc loop reduction(+:lapl)
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
-                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i+k,j) ];
+                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ];
                    T val = lapl - 
-                        one_minus_adt * d_prev[ U_OFFSET(i,j) ] + 
+                        one_minus_adt * u_prev[ U_OFFSET(i,j) ] + 
-                        two_minus_adt * d_curr[ U_OFFSET(i,j) ];
+                        two_minus_adt * u_curr[ U_OFFSET(i,j) ];
                    if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
                        val = 0;
-                d_next[ U_OFFSET(i,j) ] = val;
+                    u_next[ U_OFFSET(i,j) ] = val;
                }
            }
-		T *d_temp = d_prev;
+            std::swap(u_prev, u_curr);
-		d_prev = d_curr;
+            std::swap(u_curr, u_next);
-		d_curr = d_next;
-		d_next = d_temp;
        }
+	}
+    acc_copyout((void*) wec.g_prev.data(), nelem*sizeof(T));
+    acc_copyout((void*) wec.g_curr.data(), nelem*sizeof(T));
+    acc_copyout((void*) wec.g_next.data(), nelem*sizeof(T));
-	acc_copyout(wec.g_prev.data(), nelem*sizeof(T));
-	acc_copyout(wec.g_curr.data(), nelem*sizeof(T));
-	acc_copyout(wec.g_next.data(), nelem*sizeof(T));
    auto stop = std::chrono::high_resolution_clock::now();
    std::chrono::duration<double, std::milli> ms = stop - start;
-    time += ms.count();
+    double time = ms.count();
    std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
    std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl;
@@ -105,7 +114,7 @@ double solve_openacc(wave_equation_context<T>& wec)
 #ifdef HAVE_SILO
-    visit_dump(wec.g_curr, "wave_openacc_lastiter.silo");
+    visit_dump(wec.g_next, "wave_openacc_lastiter.silo");
 #endif /* HAVE_SILO */
    return time/wec.maxiter;
@@ -123,7 +132,7 @@ int main(void)
    double time;
-	for (size_t sz = 256; sz <= 256; sz *= 2)
+	for (size_t sz = 128; sz <= 1024; sz *= 2)
    {
        std::cout << sz << std::endl;
        wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);

--- a/kokkos-testing/test_sumbw.cpp
+++ b/kokkos-testing/test_sumbw.cpp
@@ -4,6 +4,8 @@
 #include <chrono>
 #include <numeric>
+#include <cstring>
 extern "C" {
 void sum_restrict(const double * __restrict__ prev,