From f1eb4ee47bb35ea25c1b69b6a91287fa4dd31ff0 Mon Sep 17 00:00:00 2001 From: Matteo Cicuttin <datafl4sh@toxicnet.eu> Date: Sat, 28 Mar 2020 19:41:47 +0100 Subject: [PATCH] Modified MT implementation. --- kokkos-testing/fd_catalog/CMakeLists.txt | 55 +++++++---- kokkos-testing/fd_catalog/fd_main.cpp | 14 +-- kokkos-testing/fd_catalog/fd_wave_cpu.hpp | 106 +++++++++++++++++++++- kokkos-testing/test_thread_overhead.cpp | 9 ++ 4 files changed, 152 insertions(+), 32 deletions(-) create mode 100644 kokkos-testing/test_thread_overhead.cpp diff --git a/kokkos-testing/fd_catalog/CMakeLists.txt b/kokkos-testing/fd_catalog/CMakeLists.txt index b92eb2e..1927a55 100644 --- a/kokkos-testing/fd_catalog/CMakeLists.txt +++ b/kokkos-testing/fd_catalog/CMakeLists.txt @@ -53,11 +53,44 @@ if (ENABLE_ITERTIME_OUTPUT) add_definitions(-DSAVE_ITERTIME) endif() -option(ENABLE_DAZ_FTZ "Enable Denormals Are Zero and Flush To Zero flags" ON) -if (ENABLE_DAZ_FTZ) +###################################################################### +## Optimization stuff + +option(OPT_PREFER_512bit "Prefer 512 bit vectors with AVX512 (Clang > 10 & GCC)" OFF) +if (OPT_PREFER_512bit) + # https://reviews.llvm.org/D67259 + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mprefer-vector-width=512") +endif() + +option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON) +if (OPT_AGGRESSIVE_FP) add_definitions(-DDISALLOW_DENORMALS) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math") + endif() endif() +option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON) +if (ENABLE_VECTORIZER_REMARKS) + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") + endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2") + endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized") + endif() + + if (CMAKE_CXX_COMPILER_ID STREQUAL "-PGI") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minfo") + endif() +endif() + +###################################################################### + option(ENABLE_KOKKOS "Enable Kokkos" OFF) if (ENABLE_KOKKOS) FetchContent_Declare(kokkos @@ -91,24 +124,6 @@ if (ENABLE_CUDA) endif() endif() -option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON) -if (ENABLE_VECTORIZER_REMARKS) - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") - endif() - - if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2") - endif() - - if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized") - endif() - - if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized") - endif() -endif() set(CMAKE_CXX_FLAGS_DEBUG "-g") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -g -DNDEBUG") diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp index f2d3de2..cbf749a 100644 --- a/kokkos-testing/fd_catalog/fd_main.cpp +++ b/kokkos-testing/fd_catalog/fd_main.cpp @@ -29,7 +29,7 @@ int main(int argc, char **argv) _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); std::cout << "Denormals: FTZ and DAZ" << std::endl; #endif - + _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID); //_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_DENORM); @@ -52,9 +52,9 @@ int main(int argc, char **argv) wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000); ofs << sz << " "; - //wec.init(); - //time = solve_sequential(wec); - //ofs << time << " "; + wec.init(); + time = solve_sequential(wec); + ofs << time << " "; wec.init(); time = solve_sequential_blocked(wec); @@ -68,9 +68,9 @@ int main(int argc, char **argv) for (size_t threads = 1; threads < maxthreads; threads *= 2) { - //wec.init(); - //time = solve_multithread(wec, threads); - //ofs << time << " "; + wec.init(); + time = solve_multithread_nopool(wec, threads); + ofs << time << " "; } ofs << std::endl; diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp index 73f3d75..31b620f 100644 --- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp +++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp @@ -44,7 +44,7 @@ operator<<(std::ostream& os, const tile& tl) #define BLK_ROWS 56 #define BLK_COLS 56 -//#define KERNEL_BLK_USE_MEMCPY +#define KERNEL_BLK_USE_MEMCPY #define KERNEL_BLK_USE_TEMP template<typename T> @@ -101,7 +101,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE); memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T)); #else - #pragma ivdep + //#pragma ivdep #pragma clang loop vectorize(assume_safety) for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++) { @@ -114,7 +114,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, #endif /* KERNEL_BLK_USE_TEMP */ for (size_t i = 0; i < BLK_ROWS; i++) { - #pragma ivdep + //#pragma ivdep //#pragma clang loop vectorize(assume_safety) for (size_t j = 0; j < BLK_COLS; j++) { @@ -337,6 +337,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, g_next(maxrow-1, j) = 0; } + #pragma loop distribute(enable) for (size_t i = 0; i < maxrow; i++) { g_next(i, 0) = 0; @@ -349,7 +350,7 @@ void wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, fd_grid<T>& g_next, const wave_2D_params<T>& params, - size_t thread_id = 0, size_t num_threads = 1) + size_t from = 0, size_t to = 0) { int maxrow = params.maxrow; int maxcol = params.maxcol; @@ -357,6 +358,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, T c = params.velocity; T a = params.damping; + if (to == 0) + to = maxrow; + assert(maxcol > 1); assert(maxrow > 1); @@ -373,8 +377,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, T one_minus_adt = (1.0 - a*dt); T two_minus_adt = (2.0 - a*dt); - for (size_t i = thread_id; i < maxrow; i += num_threads) + for (size_t i = from; i < to; i++) { + #pragma clang loop vectorize(enable) for (size_t j = 0; j < maxcol; j++) { T lapl = 0.0; @@ -614,3 +619,94 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths) return time/wec.maxiter; } +template<typename T> +double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths) +{ + /* Simulation parameters */ + wave_2D_params<T> params; + params.maxcol = wec.g_curr.domain_cols(); + params.maxrow = wec.g_curr.domain_rows(); + params.dt = wec.dt; + params.velocity = wec.velocity; + params.damping = wec.damping; + + + /* Multithreading stuff */ + std::mutex cv_mtx; + std::condition_variable prod_cv; + std::condition_variable cons_cv; + std::vector<bool> thread_done(nths); + bool iteration_finished = false; + + auto thread_lambda = [&](size_t thread_id, size_t num_threads) { +#ifdef DISALLOW_DENORMALS + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +#endif + + size_t count = params.maxrow / num_threads; + size_t remainder = params.maxrow % num_threads; + + size_t from = 0; + size_t to = 0; + + if (thread_id < remainder) + { + from = thread_id * (count + 1); + to = from + count; + } + else + { + from = thread_id * count + remainder; + to = from + (count - 1); + } + + wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, from, to); + }; + + + double time = 0.0; + + for (size_t iter = 0; iter < wec.maxiter; iter++) + { + auto start = std::chrono::high_resolution_clock::now(); + + std::vector<std::thread> threads(nths); + for (size_t i = 0; i < nths; i++) + threads[i] = std::thread(thread_lambda, i, nths); + + for (auto& th : threads) + th.join(); + + auto stop = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double, std::milli> ms = stop - start; + time += ms.count(); + + std::swap(wec.g_prev, wec.g_curr); + std::swap(wec.g_curr, wec.g_next); + +#ifdef HAVE_SILO +#ifdef SAVE_TIMESTEPS + if ( (iter%100) == 0 ) + { + std::stringstream ss; + ss << "wave_mt_" << iter << ".silo"; + visit_dump(wec.g_curr, ss.str()); + } +#endif /* SAVE_TIMESTEPS */ +#endif /* HAVE_SILO */ + } + + std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): "; + std::cout << time/wec.maxiter << "ms" << std::endl; + std::cout << "[Wave][MT] Wall Time (" << nths << " threads): "; + std::cout << time << "ms" << std::endl; + +#ifdef HAVE_SILO + visit_dump(wec.g_curr, "wave_mt_lastiter.silo"); +#endif /* HAVE_SILO */ + + return time/wec.maxiter; +} + + diff --git a/kokkos-testing/test_thread_overhead.cpp b/kokkos-testing/test_thread_overhead.cpp new file mode 100644 index 0000000..291ed57 --- /dev/null +++ b/kokkos-testing/test_thread_overhead.cpp @@ -0,0 +1,9 @@ +#include <thread> + +int main(int argc, char** argv) +{ + for (volatile int i = 0; i < 500000; i++) + std::thread([](){}).detach(); + return 0; +} + -- GitLab