From d965a1d06bf4c4d09b5a6f865e716a16aafd6454 Mon Sep 17 00:00:00 2001 From: Matteo Cicuttin <datafl4sh@toxicnet.eu> Date: Fri, 27 Mar 2020 19:44:14 +0100 Subject: [PATCH] Updated experimental data. --- .../AllTimes_double_update.plot | 19 ++ .../AllTimes_double_update.txt | 5 + .../AllTimes_float_update.plot | 20 +++ .../AllTimes_float_update.txt | 6 + .../CUDA_gflops_double_update.txt | 6 + .../i7-3615QM_gflops_double_update.plot | 14 ++ kokkos-testing/fd_catalog/CMakeLists.txt | 24 ++- kokkos-testing/fd_catalog/fd_main.cpp | 15 +- kokkos-testing/fd_catalog/fd_wave_cpu.hpp | 168 ++++++++++++++---- kokkos-testing/test_mandel.cpp | 13 +- 10 files changed, 243 insertions(+), 47 deletions(-) create mode 100644 kokkos-testing/experimental_data/AllTimes_double_update.plot create mode 100644 kokkos-testing/experimental_data/AllTimes_double_update.txt create mode 100644 kokkos-testing/experimental_data/AllTimes_float_update.plot create mode 100644 kokkos-testing/experimental_data/AllTimes_float_update.txt create mode 100644 kokkos-testing/experimental_data/CUDA_gflops_double_update.txt create mode 100644 kokkos-testing/experimental_data/i7-3615QM_gflops_double_update.plot diff --git a/kokkos-testing/experimental_data/AllTimes_double_update.plot b/kokkos-testing/experimental_data/AllTimes_double_update.plot new file mode 100644 index 0000000..fe76eb7 --- /dev/null +++ b/kokkos-testing/experimental_data/AllTimes_double_update.plot @@ -0,0 +1,19 @@ +set term postscript enhanced color +set output 'AllTimes_double_update.eps' + +set key left top + +set title '2D damped wave equation - double' + +set xlabel 'Mesh size' +set ylabel 'Iteration time [ms]' + +set style data histogram +set style fill solid border -1 +set logscale y +set grid ytics mytics + +set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red' + +plot 'AllTimes_double_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \ + '' u 4 ti col, '' u 5 ti col, '' u 6 ti col diff --git a/kokkos-testing/experimental_data/AllTimes_double_update.txt b/kokkos-testing/experimental_data/AllTimes_double_update.txt new file mode 100644 index 0000000..914a7e9 --- /dev/null +++ b/kokkos-testing/experimental_data/AllTimes_double_update.txt @@ -0,0 +1,5 @@ +"SIZE" "Seq" "SeqBlk" "1T" "2T" "4T" +128 0.215397 0.0721867 0.231328 0.146418 0.150125 +256 0.884406 0.284522 0.996983 0.519418 0.420706 +512 3.49626 1.34328 3.55165 1.84903 1.3564 +1024 13.9678 5.97425 14.4531 7.56775 5.25945 diff --git a/kokkos-testing/experimental_data/AllTimes_float_update.plot b/kokkos-testing/experimental_data/AllTimes_float_update.plot new file mode 100644 index 0000000..9a4b0dc --- /dev/null +++ b/kokkos-testing/experimental_data/AllTimes_float_update.plot @@ -0,0 +1,20 @@ +set term postscript enhanced color +set output 'AllTimes_float_update.eps' + +set key left top + +set title '2D damped wave equation - float' + +set xlabel 'Mesh size' +set ylabel 'Iteration time [ms]' + +set style data histogram +set style fill solid border -1 +set logscale y +set grid ytics mytics + +set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red' + +plot 'AllTimes_float_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \ + '' u 4 ti col, '' u 5 ti col, '' u 6 ti col + diff --git a/kokkos-testing/experimental_data/AllTimes_float_update.txt b/kokkos-testing/experimental_data/AllTimes_float_update.txt new file mode 100644 index 0000000..77e1555 --- /dev/null +++ b/kokkos-testing/experimental_data/AllTimes_float_update.txt @@ -0,0 +1,6 @@ +"SIZE" "Seq" "SeqBlk" "1T" "2T" "4T" +128 0.285575 0.0328261 0.266828 0.159186 0.1409 +256 1.15701 0.13435 1.07333 0.531615 0.387834 +512 4.65854 0.57531 4.21764 2.03206 1.42824 +1024 13.9774 3.15193 16.5969 8.72612 5.83051 + diff --git a/kokkos-testing/experimental_data/CUDA_gflops_double_update.txt b/kokkos-testing/experimental_data/CUDA_gflops_double_update.txt new file mode 100644 index 0000000..e9da9dc --- /dev/null +++ b/kokkos-testing/experimental_data/CUDA_gflops_double_update.txt @@ -0,0 +1,6 @@ +"Size" "GT650M" "GTX760" "i7-3615QM" +128 13.062 45.9258 13.1641 +256 16.8322 62.7176 13.3595 +512 17.469 67.3603 11.3188 +1024 17.0132 68.6386 10.1799 + diff --git a/kokkos-testing/experimental_data/i7-3615QM_gflops_double_update.plot b/kokkos-testing/experimental_data/i7-3615QM_gflops_double_update.plot new file mode 100644 index 0000000..69ac71c --- /dev/null +++ b/kokkos-testing/experimental_data/i7-3615QM_gflops_double_update.plot @@ -0,0 +1,14 @@ +set term postscript enhanced color font ',20' +set output 'i7-3615QM_gflops_double_update.eps' + +set title "i7-3615QM SeqBlk double" +set logscale x +set yrange [3:20] +set xlabel "Mesh size" +set ylabel "GFlops/s" + +set label "Intel declared: 18.4 GFlops/s" at 150,18.8 +set label "DGEMM: 3.83 GFlops/s" at 150,4.2 +set arrow 1 from 128,18.4 to 1024,18.4 nohead dt '.' +set arrow 2 from 128,3.83 to 1024,3.83 nohead dt '.' +plot 'CUDA_gflops_double_update.txt' using 1:4:xtic(1) w lp lw 2 ti 'SeqBlk GFlops/s' diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt index 82e3739..b92eb2e 100644 --- a/kokkos-testing/fd_catalog/CMakeLists.txt +++ b/kokkos-testing/fd_catalog/CMakeLists.txt @@ -21,18 +21,22 @@ endif() find_package(OpenACC) if (OpenACC_CXX_FOUND) + set(OPENACC_LINK_LIBS ${LINK_LIBS}) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp) + endif() if (ENABLE_SINGLE) add_executable(fd_openacc_single fd_openacc.cpp) target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION) target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS}) - target_link_libraries(fd_openacc_single ${LINK_LIBS} -lgomp) + target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS}) endif() if (ENABLE_DOUBLE) add_executable(fd_openacc_double fd_openacc.cpp) target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS}) - target_link_libraries(fd_openacc_double ${LINK_LIBS} -lgomp) + target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS}) endif() endif() @@ -87,10 +91,22 @@ if (ENABLE_CUDA) endif() endif() -option(ENABLE_VECTORIZER_REMARKS "Enable Clang vectorizer remarks" ON) +option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON) if (ENABLE_VECTORIZER_REMARKS) if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") + endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2") + endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized") + endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized") endif() endif() diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp index e0d50d5..f2d3de2 100644 --- a/kokkos-testing/fd_catalog/fd_main.cpp +++ b/kokkos-testing/fd_catalog/fd_main.cpp @@ -29,8 +29,9 @@ int main(int argc, char **argv) _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); std::cout << "Denormals: FTZ and DAZ" << std::endl; #endif + _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID); - + //_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_DENORM); /* Make header */ ofs << "\"SIZE\" \"Seq\" \"SeqBlk\" "; @@ -51,9 +52,9 @@ int main(int argc, char **argv) wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000); ofs << sz << " "; - wec.init(); - time = solve_sequential(wec); - ofs << time << " "; + //wec.init(); + //time = solve_sequential(wec); + //ofs << time << " "; wec.init(); time = solve_sequential_blocked(wec); @@ -67,9 +68,9 @@ int main(int argc, char **argv) for (size_t threads = 1; threads < maxthreads; threads *= 2) { - wec.init(); - time = solve_multithread(wec, threads); - ofs << time << " "; + //wec.init(); + //time = solve_multithread(wec, threads); + //ofs << time << " "; } ofs << std::endl; diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp index 06f452c..73f3d75 100644 --- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp +++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp @@ -44,6 +44,8 @@ operator<<(std::ostream& os, const tile& tl) #define BLK_ROWS 56 #define BLK_COLS 56 +//#define KERNEL_BLK_USE_MEMCPY +#define KERNEL_BLK_USE_TEMP template<typename T> void @@ -51,9 +53,6 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, fd_grid<T>& g_next, const wave_2D_params<T>& params) { - /* 1) memcpy is faster than for - * 2) if you put inner block processing, you kill perf - */ int maxrow = params.maxrow; int maxcol = params.maxcol; T dt = params.dt; @@ -71,10 +70,12 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, static const T w4 = -1.0/560.0; static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; +#ifdef KERNEL_BLK_USE_TEMP T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE]; +#endif - T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); - T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1)); + T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1)); T one_minus_adt = (1.0 - a*dt); T two_minus_adt = (2.0 - a*dt); @@ -91,19 +92,34 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, { /* Here all the dimensions are known at compile time, let the * compiler do its job. */ +#ifdef KERNEL_BLK_USE_TEMP for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++) { +#ifdef KERNEL_BLK_USE_MEMCPY int ofs_x = bj - WAVE_8_HALO_SIZE; int ofs_y = bi + i - WAVE_8_HALO_SIZE; assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T)); +#else + #pragma ivdep + #pragma clang loop vectorize(assume_safety) + for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++) + { + int ofs_x = bj + j - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + s_curr[i][j] = g_curr(ofs_y, ofs_x); + } +#endif /* KERNEL_BLK_USE_MEMCPY */ } - +#endif /* KERNEL_BLK_USE_TEMP */ for (size_t i = 0; i < BLK_ROWS; i++) { + #pragma ivdep + //#pragma clang loop vectorize(assume_safety) for (size_t j = 0; j < BLK_COLS; j++) { T lapl = 0.0; +#ifdef KERNEL_BLK_USE_TEMP for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; @@ -111,8 +127,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; T val = lapl - - one_minus_adt * g_prev(bi+i, bj+j) + - two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; +#else + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k); + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j); + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * g_curr(bi+i, bj+j); +#endif /* KERNEL_BLK_USE_TEMP */ assert(j < BLK_COLS); g_next(bi+i, bj+j) = val; @@ -124,22 +151,37 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, /* Do upper blocks */ { size_t bi = blocks_y*BLK_ROWS; + for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS) { - for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++) +#ifdef KERNEL_BLK_USE_TEMP + for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++) { - +#ifdef KERNEL_BLK_USE_MEMCPY int ofs_x = bj - WAVE_8_HALO_SIZE; int ofs_y = bi + i - WAVE_8_HALO_SIZE; assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T)); +#else + #pragma clang loop vectorize(assume_safety) + for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++) + { + int ofs_x = bj + j - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + s_curr[i][j] = g_curr(ofs_y, ofs_x); + } +#endif } +#endif /* KERNEL_BLK_USE_TEMP */ for (size_t i = 0; i < rem_y; i++) { + #pragma ivdep + //#pragma clang loop vectorize(assume_safety) for (size_t j = 0; j < BLK_COLS; j++) { T lapl = 0.0; +#ifdef KERNEL_BLK_USE_TEMP for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; @@ -147,8 +189,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; T val = lapl - - one_minus_adt * g_prev(bi+i, bj+j) + - two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; +#else + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k); + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j); + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * g_curr(bi+i, bj+j); +#endif /* KERNEL_BLK_USE_TEMP */ assert(j < BLK_COLS); g_next(bi+i, bj+j) = val; @@ -162,29 +215,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, size_t bi = blocks_y*BLK_ROWS; size_t bj = blocks_x*BLK_COLS; - for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++) +#ifdef KERNEL_BLK_USE_TEMP + for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++) { - +#ifdef KERNEL_BLK_USE_MEMCPY int ofs_x = bj - WAVE_8_HALO_SIZE; int ofs_y = bi + i - WAVE_8_HALO_SIZE; assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); - memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T)); + memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T)); +#else + #pragma clang loop vectorize(assume_safety) + for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++) + { + int ofs_x = bj + j - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + s_curr[i][j] = g_curr(ofs_y, ofs_x); + } +#endif } +#endif /* KERNEL_BLK_USE_TEMP */ for (size_t i = 0; i < rem_y; i++) { + #pragma ivdep + //#pragma clang loop vectorize(assume_safety) for (size_t j = 0; j < rem_x; j++) { T lapl = 0.0; - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; - - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; +#ifdef KERNEL_BLK_USE_TEMP + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; - T val = lapl - + T val = lapl - one_minus_adt * g_prev(bi+i, bj+j) + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; +#else + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k); + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j); + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * g_curr(bi+i, bj+j); +#endif /* KERNEL_BLK_USE_TEMP */ assert(j < BLK_COLS); g_next(bi+i, bj+j) = val; @@ -197,29 +275,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, { size_t bj = blocks_x*BLK_COLS; +#ifdef KERNEL_BLK_USE_TEMP for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++) { - +#ifdef KERNEL_BLK_USE_MEMCPY int ofs_x = bj - WAVE_8_HALO_SIZE; int ofs_y = bi + i - WAVE_8_HALO_SIZE; assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); - memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T)); + memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T)); +#else + #pragma clang loop vectorize(assume_safety) + for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++) + { + int ofs_x = bj + j - WAVE_8_HALO_SIZE; + int ofs_y = bi + i - WAVE_8_HALO_SIZE; + s_curr[i][j] = g_curr(ofs_y, ofs_x); + } +#endif } +#endif /* KERNEL_BLK_USE_TEMP */ for (size_t i = 0; i < BLK_ROWS; i++) { + #pragma ivdep + //#pragma clang loop vectorize(assume_safety) for (size_t j = 0; j < rem_x; j++) { T lapl = 0.0; - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; - - for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) - lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; +#ifdef KERNEL_BLK_USE_TEMP + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k]; + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE]; - T val = lapl - + T val = lapl - one_minus_adt * g_prev(bi+i, bj+j) + two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE]; +#else + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k); + + for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) + lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j); + + T val = lapl - + one_minus_adt * g_prev(bi+i, bj+j) + + two_minus_adt * g_curr(bi+i, bj+j); +#endif /* KERNEL_BLK_USE_TEMP */ assert(j < BLK_COLS); g_next(bi+i, bj+j) = val; @@ -265,8 +368,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, static const T w4 = -1.0/560.0; static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 }; - T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1); - T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1); + T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1)); + T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1)); T one_minus_adt = (1.0 - a*dt); T two_minus_adt = (2.0 - a*dt); @@ -281,7 +384,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++) lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j); - T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j); + T val = lapl - one_minus_adt * g_prev(i, j); + val += two_minus_adt * g_curr(i, j); if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) ) val = 0; diff --git a/kokkos-testing/test_mandel.cpp b/kokkos-testing/test_mandel.cpp index 6365842..832b340 100644 --- a/kokkos-testing/test_mandel.cpp +++ b/kokkos-testing/test_mandel.cpp @@ -3,6 +3,9 @@ #include <vector> #include <chrono> +#include <pmmintrin.h> +#include <xmmintrin.h> + #define YMIN (-1.5) #define YMAX (1.5) #define XMIN (-2.5) @@ -56,8 +59,6 @@ void gpu_mandel(T *mandel, int maxiter) mandel[i*SIZE_X + j] = iter; } -#endif /* __NVCC__ */ - template<typename T> void compute_gpu(std::vector<T>& mandel, int maxiter) { @@ -65,10 +66,12 @@ void compute_gpu(std::vector<T>& mandel, int maxiter) dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS); } +#endif /* __NVCC__ */ + int main(void) { size_t maxiter = 1000; - using T = double; + using T = float; std::vector<T> mandel(SIZE_X*SIZE_Y); @@ -76,7 +79,9 @@ int main(void) size_t flops = 0; - #pragma omp parallel for + //_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + //_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + for (size_t i = 0; i < SIZE_Y; i++) { for (size_t j = 0; j < SIZE_X; j++) -- GitLab