diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt index 8d3d2f2493c4b30d8d3f7b544274d3eacc8d90d7..ee0b1ebe836f50b7cf1e9ce98300dfec45f0d6e4 100644 --- a/kokkos-testing/fd_catalog/CMakeLists.txt +++ b/kokkos-testing/fd_catalog/CMakeLists.txt @@ -36,20 +36,22 @@ endif() find_package(OpenACC) if (OpenACC_CXX_FOUND) set(OPENACC_LINK_LIBS ${LINK_LIBS}) - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (COMPILER_IS_GNU) set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp) endif() - + if (ENABLE_SINGLE) add_executable(fd_openacc_single fd_openacc.cpp) target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION) target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS}) + target_link_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS}) target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS}) endif() if (ENABLE_DOUBLE) add_executable(fd_openacc_double fd_openacc.cpp) target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS}) + target_link_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS}) target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS}) endif() endif() @@ -79,7 +81,7 @@ endif() option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON) if (OPT_AGGRESSIVE_FP) add_definitions(-DDISALLOW_DENORMALS) - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if (COMPILER_IS_CLANG OR COMPILER_IS_GNU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") endif() endif() diff --git a/kokkos-testing/fd_catalog/fd_openacc.cpp b/kokkos-testing/fd_catalog/fd_openacc.cpp index 0d80c0c89a3e0e6e0f3827864ffa72131ad8faa8..d282a762f7dfc6d282a759efea707a4af18b6849 100644 --- a/kokkos-testing/fd_catalog/fd_openacc.cpp +++ b/kokkos-testing/fd_catalog/fd_openacc.cpp @@ -9,6 +9,19 @@ /* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */ +#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) ) + +/* +#pragma acc routine +template<typename T> +void wave_kernel(const T * __restrict__ d_prev, + const T * __restrict__ d_curr, + T * __restrict__ d_next, + size_t maxrow, size_t maxcol, T c, T a, T dt) +{ +} +*/ + template<typename T> double solve_openacc(wave_equation_context<T>& wec) { @@ -22,75 +35,71 @@ double solve_openacc(wave_equation_context<T>& wec) assert(maxcol > 1); assert(maxrow > 1); - double time = 0.0; - -#ifdef SAVE_ITERTIME - std::ofstream ofs; - ofs.open("itertime-naive.txt"); -#endif /* SAVE_ITERTIME */ - /**** Initialize constants ****/ - static const T w0 = -205.0/72.0; - static const T w1 = 8.0/5.0; - static const T w2 = -1.0/5.0; - static const T w3 = 8.0/315.0; - static const T w4 = -1.0/560.0; - static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; + const T w0 = -205.0/72.0; + const T w1 = 8.0/5.0; + const T w2 = -1.0/5.0; + const T w3 = 8.0/315.0; + const T w4 = -1.0/560.0; + const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; - size_t nelem = wec.g_curr.size(); - T *d_prev = acc_copyin(wec.g_prev.data(), nelem*sizeof(T)); - T *d_curr = acc_copyin(wec.g_curr.data(), nelem*sizeof(T)); - T *d_next = acc_copyin(wec.g_next.data(), nelem*sizeof(T)); -#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) ) + size_t nelem = wec.g_curr.size(); + T * __restrict__ u_prev = (T *) acc_copyin((void*) wec.g_prev.data(), nelem*sizeof(T)); + T * __restrict__ u_curr = (T *) acc_copyin((void*) wec.g_curr.data(), nelem*sizeof(T)); + T * __restrict__ u_next = (T *) acc_copyin((void*) wec.g_next.data(), nelem*sizeof(T)); auto start = std::chrono::high_resolution_clock::now(); -#pragma acc kernels - for (size_t iter = 0; iter < wec.maxiter; iter++) + #pragma acc data copyin(dt, a, c, maxcol, maxrow, w) + //#pragma acc data copy(u_prev[0:nelem], u_curr[0:nelem], u_next[0:nelem]) { + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); T one_minus_adt = (1.0 - a*dt); T two_minus_adt = (2.0 - a*dt); - #pragma acc loop independent - for (size_t i = 0; i < maxrow; i++) + for (size_t iter = 0; iter < wec.maxiter; iter++) { - #pragma acc loop independent - for (size_t j = 0; j < maxcol; j++) + #pragma acc parallel loop tile(32,32) deviceptr(u_prev, u_curr, u_next) + for (size_t i = 0; i < maxrow; i++) { - T lapl = 0.0; - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i,j+k) ]; + for (size_t j = 0; j < maxcol; j++) + { + T lapl = 0.0; + #pragma acc loop reduction(+:lapl) + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ]; + + #pragma acc loop reduction(+:lapl) + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ]; + + T val = lapl - + one_minus_adt * u_prev[ U_OFFSET(i,j) ] + + two_minus_adt * u_curr[ U_OFFSET(i,j) ]; + + if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) ) + val = 0; - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i+k,j) ]; - - T val = lapl - - one_minus_adt * d_prev[ U_OFFSET(i,j) ] + - two_minus_adt * d_curr[ U_OFFSET(i,j) ]; - - if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) ) - val = 0; - - d_next[ U_OFFSET(i,j) ] = val; + u_next[ U_OFFSET(i,j) ] = val; + } } + + std::swap(u_prev, u_curr); + std::swap(u_curr, u_next); } + } + + acc_copyout((void*) wec.g_prev.data(), nelem*sizeof(T)); + acc_copyout((void*) wec.g_curr.data(), nelem*sizeof(T)); + acc_copyout((void*) wec.g_next.data(), nelem*sizeof(T)); - T *d_temp = d_prev; - d_prev = d_curr; - d_curr = d_next; - d_next = d_temp; - } - - acc_copyout(wec.g_prev.data(), nelem*sizeof(T)); - acc_copyout(wec.g_curr.data(), nelem*sizeof(T)); - acc_copyout(wec.g_next.data(), nelem*sizeof(T)); auto stop = std::chrono::high_resolution_clock::now(); std::chrono::duration<double, std::milli> ms = stop - start; - time += ms.count(); + double time = ms.count(); std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl; @@ -105,7 +114,7 @@ double solve_openacc(wave_equation_context<T>& wec) #ifdef HAVE_SILO - visit_dump(wec.g_curr, "wave_openacc_lastiter.silo"); + visit_dump(wec.g_next, "wave_openacc_lastiter.silo"); #endif /* HAVE_SILO */ return time/wec.maxiter; @@ -123,7 +132,7 @@ int main(void) double time; - for (size_t sz = 256; sz <= 256; sz *= 2) + for (size_t sz = 128; sz <= 1024; sz *= 2) { std::cout << sz << std::endl; wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000); diff --git a/kokkos-testing/test_sumbw.cpp b/kokkos-testing/test_sumbw.cpp index d1271a71a473c2171b16252a575cf28dd4e8b293..d2a6b9335fbe4b0b0fce1ce379749da5f2ba8544 100644 --- a/kokkos-testing/test_sumbw.cpp +++ b/kokkos-testing/test_sumbw.cpp @@ -4,6 +4,8 @@ #include <chrono> #include <numeric> +#include <cstring> + extern "C" { void sum_restrict(const double * __restrict__ prev,