Skip to content
Snippets Groups Projects
Commit d14736e3 authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

OpenACC *really* working now.

parent 935ec216
No related branches found
No related tags found
No related merge requests found
...@@ -36,7 +36,7 @@ endif() ...@@ -36,7 +36,7 @@ endif()
find_package(OpenACC) find_package(OpenACC)
if (OpenACC_CXX_FOUND) if (OpenACC_CXX_FOUND)
set(OPENACC_LINK_LIBS ${LINK_LIBS}) set(OPENACC_LINK_LIBS ${LINK_LIBS})
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if (COMPILER_IS_GNU)
set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp) set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp)
endif() endif()
...@@ -44,12 +44,14 @@ if (OpenACC_CXX_FOUND) ...@@ -44,12 +44,14 @@ if (OpenACC_CXX_FOUND)
add_executable(fd_openacc_single fd_openacc.cpp) add_executable(fd_openacc_single fd_openacc.cpp)
target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION) target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION)
target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS}) target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
target_link_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS}) target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS})
endif() endif()
if (ENABLE_DOUBLE) if (ENABLE_DOUBLE)
add_executable(fd_openacc_double fd_openacc.cpp) add_executable(fd_openacc_double fd_openacc.cpp)
target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS}) target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
target_link_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS}) target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS})
endif() endif()
endif() endif()
...@@ -79,7 +81,7 @@ endif() ...@@ -79,7 +81,7 @@ endif()
option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON) option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON)
if (OPT_AGGRESSIVE_FP) if (OPT_AGGRESSIVE_FP)
add_definitions(-DDISALLOW_DENORMALS) add_definitions(-DDISALLOW_DENORMALS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if (COMPILER_IS_CLANG OR COMPILER_IS_GNU)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
endif() endif()
endif() endif()
......
...@@ -9,6 +9,19 @@ ...@@ -9,6 +9,19 @@
/* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */ /* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */
#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) )
/*
#pragma acc routine
template<typename T>
void wave_kernel(const T * __restrict__ d_prev,
const T * __restrict__ d_curr,
T * __restrict__ d_next,
size_t maxrow, size_t maxcol, T c, T a, T dt)
{
}
*/
template<typename T> template<typename T>
double solve_openacc(wave_equation_context<T>& wec) double solve_openacc(wave_equation_context<T>& wec)
{ {
...@@ -22,75 +35,71 @@ double solve_openacc(wave_equation_context<T>& wec) ...@@ -22,75 +35,71 @@ double solve_openacc(wave_equation_context<T>& wec)
assert(maxcol > 1); assert(maxcol > 1);
assert(maxrow > 1); assert(maxrow > 1);
double time = 0.0;
#ifdef SAVE_ITERTIME
std::ofstream ofs;
ofs.open("itertime-naive.txt");
#endif /* SAVE_ITERTIME */
/**** Initialize constants ****/ /**** Initialize constants ****/
static const T w0 = -205.0/72.0; const T w0 = -205.0/72.0;
static const T w1 = 8.0/5.0; const T w1 = 8.0/5.0;
static const T w2 = -1.0/5.0; const T w2 = -1.0/5.0;
static const T w3 = 8.0/315.0; const T w3 = 8.0/315.0;
static const T w4 = -1.0/560.0; const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
size_t nelem = wec.g_curr.size();
T *d_prev = acc_copyin(wec.g_prev.data(), nelem*sizeof(T));
T *d_curr = acc_copyin(wec.g_curr.data(), nelem*sizeof(T));
T *d_next = acc_copyin(wec.g_next.data(), nelem*sizeof(T));
#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) ) size_t nelem = wec.g_curr.size();
T * __restrict__ u_prev = (T *) acc_copyin((void*) wec.g_prev.data(), nelem*sizeof(T));
T * __restrict__ u_curr = (T *) acc_copyin((void*) wec.g_curr.data(), nelem*sizeof(T));
T * __restrict__ u_next = (T *) acc_copyin((void*) wec.g_next.data(), nelem*sizeof(T));
auto start = std::chrono::high_resolution_clock::now(); auto start = std::chrono::high_resolution_clock::now();
#pragma acc kernels #pragma acc data copyin(dt, a, c, maxcol, maxrow, w)
for (size_t iter = 0; iter < wec.maxiter; iter++) //#pragma acc data copy(u_prev[0:nelem], u_curr[0:nelem], u_next[0:nelem])
{ {
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T one_minus_adt = (1.0 - a*dt); T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt); T two_minus_adt = (2.0 - a*dt);
#pragma acc loop independent for (size_t iter = 0; iter < wec.maxiter; iter++)
{
#pragma acc parallel loop tile(32,32) deviceptr(u_prev, u_curr, u_next)
for (size_t i = 0; i < maxrow; i++) for (size_t i = 0; i < maxrow; i++)
{ {
#pragma acc loop independent
for (size_t j = 0; j < maxcol; j++) for (size_t j = 0; j < maxcol; j++)
{ {
T lapl = 0.0; T lapl = 0.0;
#pragma acc loop reduction(+:lapl)
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i,j+k) ]; lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ];
#pragma acc loop reduction(+:lapl)
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * d_curr[ U_OFFSET(i+k,j) ]; lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ];
T val = lapl - T val = lapl -
one_minus_adt * d_prev[ U_OFFSET(i,j) ] + one_minus_adt * u_prev[ U_OFFSET(i,j) ] +
two_minus_adt * d_curr[ U_OFFSET(i,j) ]; two_minus_adt * u_curr[ U_OFFSET(i,j) ];
if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) ) if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
val = 0; val = 0;
d_next[ U_OFFSET(i,j) ] = val; u_next[ U_OFFSET(i,j) ] = val;
} }
} }
T *d_temp = d_prev; std::swap(u_prev, u_curr);
d_prev = d_curr; std::swap(u_curr, u_next);
d_curr = d_next;
d_next = d_temp;
} }
}
acc_copyout((void*) wec.g_prev.data(), nelem*sizeof(T));
acc_copyout((void*) wec.g_curr.data(), nelem*sizeof(T));
acc_copyout((void*) wec.g_next.data(), nelem*sizeof(T));
acc_copyout(wec.g_prev.data(), nelem*sizeof(T));
acc_copyout(wec.g_curr.data(), nelem*sizeof(T));
acc_copyout(wec.g_next.data(), nelem*sizeof(T));
auto stop = std::chrono::high_resolution_clock::now(); auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start; std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count(); double time = ms.count();
std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl; std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl;
...@@ -105,7 +114,7 @@ double solve_openacc(wave_equation_context<T>& wec) ...@@ -105,7 +114,7 @@ double solve_openacc(wave_equation_context<T>& wec)
#ifdef HAVE_SILO #ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_openacc_lastiter.silo"); visit_dump(wec.g_next, "wave_openacc_lastiter.silo");
#endif /* HAVE_SILO */ #endif /* HAVE_SILO */
return time/wec.maxiter; return time/wec.maxiter;
...@@ -123,7 +132,7 @@ int main(void) ...@@ -123,7 +132,7 @@ int main(void)
double time; double time;
for (size_t sz = 256; sz <= 256; sz *= 2) for (size_t sz = 128; sz <= 1024; sz *= 2)
{ {
std::cout << sz << std::endl; std::cout << sz << std::endl;
wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000); wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
......
...@@ -4,6 +4,8 @@ ...@@ -4,6 +4,8 @@
#include <chrono> #include <chrono>
#include <numeric> #include <numeric>
#include <cstring>
extern "C" { extern "C" {
void sum_restrict(const double * __restrict__ prev, void sum_restrict(const double * __restrict__ prev,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment