diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt index df923f68de2f7e889442673acecf1f90dddc88e6..82e3739f43b8a8c49f27fe321ce23cf35d0ec7ce 100644 --- a/kokkos-testing/fd_catalog/CMakeLists.txt +++ b/kokkos-testing/fd_catalog/CMakeLists.txt @@ -21,7 +21,19 @@ endif() find_package(OpenACC) if (OpenACC_CXX_FOUND) - add_definitions(-DHAVE_OPENACC) + + if (ENABLE_SINGLE) + add_executable(fd_openacc_single fd_openacc.cpp) + target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION) + target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS}) + target_link_libraries(fd_openacc_single ${LINK_LIBS} -lgomp) + endif() + + if (ENABLE_DOUBLE) + add_executable(fd_openacc_double fd_openacc.cpp) + target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS}) + target_link_libraries(fd_openacc_double ${LINK_LIBS} -lgomp) + endif() endif() option(ENABLE_TIMESTEP_OUTPUT "Save timesteps (don't use during perf meas)" OFF) diff --git a/kokkos-testing/fd_catalog/fd_dataio.hpp b/kokkos-testing/fd_catalog/fd_dataio.hpp index 2fe9efc31ef86761252a78521b94754021a7b22c..289c90757f475c4691fbab5f96f9109a0745e473 100644 --- a/kokkos-testing/fd_catalog/fd_dataio.hpp +++ b/kokkos-testing/fd_catalog/fd_dataio.hpp @@ -20,7 +20,7 @@ visit_dump(const fd_grid<T>& g, const std::string& fn) static_assert(std::is_same<T,double>::value or std::is_same<T,float>::value, "Only double or float"); DBfile *db = nullptr; - db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_HDF5); + db = DBCreate(fn.c_str(), DB_CLOBBER, DB_LOCAL, "visit output", DB_PDB); if (!db) { std::cout << "Cannot write simulation output" << std::endl; diff --git a/kokkos-testing/fd_catalog/fd_openacc.cpp b/kokkos-testing/fd_catalog/fd_openacc.cpp index 9a857a17916ac9cbf7231c0025d980095d2a52b8..ad4ffea78b2a8149b4be3a7445e7b5f83a0292e5 100644 --- a/kokkos-testing/fd_catalog/fd_openacc.cpp +++ b/kokkos-testing/fd_catalog/fd_openacc.cpp @@ -5,9 +5,123 @@ #include "fd_wave_cpu.hpp" +/* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */ + template<typename T> -double solve_openacc(wave_equation_context<T>& wec, size_t nths) +double solve_openacc(wave_equation_context<T>& wec) { + /* Simulation parameters */ + int maxcol = wec.g_curr.domain_cols(); + int maxrow = wec.g_curr.domain_rows(); + T dt = wec.dt; + T c = wec.velocity; + T a = wec.damping; + + assert(maxcol > 1); + assert(maxrow > 1); + + double time = 0.0; + +#ifdef SAVE_ITERTIME + std::ofstream ofs; + ofs.open("itertime-naive.txt"); +#endif /* SAVE_ITERTIME */ + + /**** Initialize constants ****/ + static const T w0 = -205.0/72.0; + static const T w1 = 8.0/5.0; + static const T w2 = -1.0/5.0; + static const T w3 = 8.0/315.0; + static const T w4 = -1.0/560.0; + static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; + + T * __restrict__ u_prev = wec.g_prev.data(); + T * __restrict__ u_curr = wec.g_curr.data(); + T * __restrict__ u_next = wec.g_next.data(); + size_t nelem = wec.g_curr.size(); + +#define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) ) + +//#pragma acc data copy(u_prev[0:nelem]) +//#pragma acc data copy(u_curr[0:nelem]) +//#pragma acc data copy(u_next[0:nelem]) + for (size_t iter = 0; iter < wec.maxiter; iter++) + { + auto start = std::chrono::high_resolution_clock::now(); + + T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); + T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T one_minus_adt = (1.0 - a*dt); + T two_minus_adt = (2.0 - a*dt); + +#pragma omp parallel for shared(maxrow, maxcol, u_prev, u_curr, u_next) +#pragma acc kernels + #pragma acc loop independent + for (size_t i = 0; i < maxrow; i++) + { + #pragma acc loop independent + for (size_t j = 0; j < maxcol; j++) + { + T lapl = 0.0; + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ]; + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ]; + + T val = lapl - + one_minus_adt * u_prev[ U_OFFSET(i,j) ] + + two_minus_adt * u_curr[ U_OFFSET(i,j) ]; + + if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) ) + val = 0; + + u_next[ U_OFFSET(i,j) ] = val; + } + } + + auto stop = std::chrono::high_resolution_clock::now(); + + std::chrono::duration<double, std::milli> ms = stop - start; + time += ms.count(); + + std::swap(u_prev, u_curr); + std::swap(u_curr, u_next); + +#ifdef HAVE_SILO +#ifdef SAVE_TIMESTEPS + if ( (iter%100) == 0 ) + { + //#pragma acc update self(u_curr[0:nelem]) + std::stringstream ss; + ss << "wave_openacc_" << iter << ".silo"; + visit_dump(wec.g_curr, ss.str()); + } +#endif /* SAVE_TIMESTEPS */ +#endif /* HAVE_SILO */ + +#ifdef SAVE_ITERTIME + ofs << i << " " << ms.count() << std::endl; +#endif /* SAVE_ITERTIME */ + } + + std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl; + std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl; + + double itertime = time/wec.maxiter; + double gflops_s = 58*(maxrow*maxcol)/(1e6*itertime); + std::cout << "[Wave][OpenACC] GFlops/s: " << gflops_s << std::endl; + + size_t kernel_bytes = 3*sizeof(T)*(maxrow*maxcol); + double gbytes_s = kernel_bytes/(1e6*itertime); + std::cout << "[Wave][OpenACC] Bandwidth: " << gbytes_s << "GB/s" << std::endl; + + +#ifdef HAVE_SILO + visit_dump(wec.g_curr, "wave_openacc_lastiter.silo"); +#endif /* HAVE_SILO */ + + return time/wec.maxiter; } int main(void) @@ -20,8 +134,11 @@ int main(void) std::cout << "Precision: double" << std::endl; #endif - for (size_t sz = 128; sz <= 1024; sz *= 2) + double time; + + for (size_t sz = 256; sz <= 256; sz *= 2) { + std::cout << sz << std::endl; wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000); wec.init(); @@ -29,4 +146,6 @@ int main(void) } return 0; -} \ No newline at end of file +} + + diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp index 152c2b19ccfc7cc03e3bfb33fbddfa8b2a5f7b88..06f452c625a99d9577ac0ff5257b9d956f9615c5 100644 --- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp +++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp @@ -14,6 +14,8 @@ #include <sstream> #include <string.h> +#include <pmmintrin.h> +#include <xmmintrin.h> #include "fd_grid.hpp" #include "fd_dataio.hpp"