From f2b143dbeefdf1635810f1f2176e428302464d8f Mon Sep 17 00:00:00 2001 From: Matteo Cicuttin <datafl4sh@toxicnet.eu> Date: Thu, 26 Mar 2020 18:56:39 +0100 Subject: [PATCH] OpenACC working but slow. --- kokkos-testing/fd_catalog/CMakeLists.txt | 14 ++- kokkos-testing/fd_catalog/fd_dataio.hpp | 2 +- kokkos-testing/fd_catalog/fd_openacc.cpp | 125 +++++++++++++++++++++- kokkos-testing/fd_catalog/fd_wave_cpu.hpp | 2 + 4 files changed, 138 insertions(+), 5 deletions(-) diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt index df923f6..82e3739 100644 --- a/kokkos-testing/fd_catalog/CMakeLists.txt +++ b/kokkos-testing/fd_catalog/CMakeLists.txt @@ -21,7 +21,19 @@ endif() find_package(OpenACC) if (OpenACC_CXX_FOUND) - add_definitions(-DHAVE_OPENACC) + + if (ENABLE_SINGLE) + add_executable(fd_openacc_single fd_openacc.cpp) + target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION) + target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS}) + target_link_libraries(fd_openacc_single ${LINK_LIBS} -lgomp) + endif() + + if (ENABLE_DOUBLE) + add_executable(fd_openacc_double fd_openacc.cpp) + target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS}) + target_link_libraries(fd_openacc_double ${LINK_LIBS} -lgomp) + endif() endif() option(ENABLE_TIMESTEP_OUTPUT "Save timesteps (don't use during perf meas)" OFF) diff --git a/kokkos-testing/fd_catalog/fd_dataio.hpp b/kokkos-testing/fd_catalog/fd_dataio.hpp index 2fe9efc..289c907 100644 --- a/kokkos-testing/fd_catalog/fd_dataio.hpp +++ b/kokkos-testing/fd_catalog/fd_dataio.hpp @@ -20,7 +20,7 @@ visit_dump(const fd_grid<T>& g, const std::string& fn) static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float"); DBfile *db = nullptr; - db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_HDF5); + db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_PDB); if (!db) { std::cout << "Cannot write simulation output" << std::endl; diff --git a/kokkos-testing/fd_catalog/fd_openacc.cpp b/kokkos-testing/fd_catalog/fd_openacc.cpp index 9a857a1..ad4ffea 100644 --- a/kokkos-testing/fd_catalog/fd_openacc.cpp +++ b/kokkos-testing/fd_catalog/fd_openacc.cpp @@ -5,9 +5,123 @@ #include "fd_wave_cpu.hpp" +/* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */ + template<typename T> -double solve_openacc(wave_equation_context<T>& wec, size_t nths) +double solve_openacc(wave_equation_context<T>& wec) { + /* Simulation parameters */ + int maxcol = wec.g_curr.domain_cols(); + int maxrow = wec.g_curr.domain_rows(); + T dt = wec.dt; + T c = wec.velocity; + T a = wec.damping; + + assert(maxcol > 1); + assert(maxrow > 1); + + double time = 0.0; + +#ifdef SAVE_ITERTIME + std::ofstream ofs; + ofs.open("itertime-naive.txt"); +#endif /* SAVE_ITERTIME */ + + /**** Initialize constants ****/ + static const T w0 = -205.0/72.0; + static const T w1 = 8.0/5.0; + static const T w2 = -1.0/5.0; + static const T w3 = 8.0/315.0; + static const T w4 = -1.0/560.0; + static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; + + T * __restrict__ u_prev = wec.g_prev.data(); + T * __restrict__ u_curr = wec.g_curr.data(); + T * __restrict__ u_next = wec.g_next.data(); + size_t nelem = wec.g_curr.size(); + +#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) ) + +//#pragma acc data copy(u_prev[0:nelem]) +//#pragma acc data copy(u_curr[0:nelem]) +//#pragma acc data copy(u_next[0:nelem]) + for (size_t iter = 0; iter < wec.maxiter; iter++) + { + auto start = std::chrono::high_resolution_clock::now(); + + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T one_minus_adt = (1.0 - a*dt); + T two_minus_adt = (2.0 - a*dt); + +#pragma omp parallel for shared(maxrow, maxcol, u_prev, u_curr, u_next) +#pragma acc kernels + #pragma acc loop independent + for (size_t i = 0; i < maxrow; i++) + { + #pragma acc loop independent + for (size_t j = 0; j < maxcol; j++) + { + T lapl = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ]; + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ]; + + T val = lapl - + one_minus_adt * u_prev[ U_OFFSET(i,j) ] + + two_minus_adt * u_curr[ U_OFFSET(i,j) ]; + + if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) ) + val = 0; + + u_next[ U_OFFSET(i,j) ] = val; + } + } + + auto stop = std::chrono::high_resolution_clock::now(); + + std::chrono::duration<double, std::milli> ms = stop - start; + time += ms.count(); + + std::swap(u_prev, u_curr); + std::swap(u_curr, u_next); + +#ifdef HAVE_SILO +#ifdef SAVE_TIMESTEPS + if ( (iter%100) == 0 ) + { + //#pragma acc update self(u_curr[0:nelem]) + std::stringstream ss; + ss << "wave_openacc_" << iter << ".silo"; + visit_dump(wec.g_curr, ss.str()); + } +#endif /* SAVE_TIMESTEPS */ +#endif /* HAVE_SILO */ + +#ifdef SAVE_ITERTIME + ofs << i << " " << ms.count() << std::endl; +#endif /* SAVE_ITERTIME */ + } + + std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; + std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl; + + double itertime = time/wec.maxiter; + double gflops_s = 58*(maxrow*maxcol)/(1e6*itertime); + std::cout << "[Wave][OpenACC] GFlops/s: " << gflops_s << std::endl; + + size_t kernel_bytes = 3*sizeof(T)*(maxrow*maxcol); + double gbytes_s = kernel_bytes/(1e6*itertime); + std::cout << "[Wave][OpenACC] Bandwidth: " << gbytes_s << "GB/s" << std::endl; + + +#ifdef HAVE_SILO + visit_dump(wec.g_curr, "wave_openacc_lastiter.silo"); +#endif /* HAVE_SILO */ + + return time/wec.maxiter; } int main(void) @@ -20,8 +134,11 @@ int main(void) std::cout << "Precision: double" << std::endl; #endif - for (size_t sz = 128; sz <= 1024; sz *= 2) + double time; + + for (size_t sz = 256; sz <= 256; sz *= 2) { + std::cout << sz << std::endl; wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000); wec.init(); @@ -29,4 +146,6 @@ int main(void) } return 0; -} \ No newline at end of file +} + + diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp index 152c2b1..06f452c 100644 --- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp +++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp @@ -14,6 +14,8 @@ #include <sstream> #include <string.h> +#include <pmmintrin.h> +#include <xmmintrin.h> #include "fd_grid.hpp" #include "fd_dataio.hpp" -- GitLab