Modified MT implementation.

f1eb4ee4 · Matteo Cicuttin · d965a1d0 · f1eb4ee4 · f1eb4ee4 · f1eb4ee4
Commit f1eb4ee4 authored 5 years ago by Matteo Cicuttin
--- a/kokkos-testing/fd_catalog/CMakeLists.txt
+++ b/kokkos-testing/fd_catalog/CMakeLists.txt
@@ -53,11 +53,44 @@ if (ENABLE_ITERTIME_OUTPUT)
    add_definitions(-DSAVE_ITERTIME)
 endif()

-option(ENABLE_DAZ_FTZ "Enable Denormals Are Zero and Flush To Zero flags" ON)
-if (ENABLE_DAZ_FTZ)
+######################################################################
+## Optimization stuff
+
+option(OPT_PREFER_512bit "Prefer 512 bit vectors with AVX512 (Clang > 10 & GCC)" OFF)
+if (OPT_PREFER_512bit)
+    # Adds the preferred-vector-width flag; background: https://reviews.llvm.org/D67259
+    string(APPEND CMAKE_CXX_FLAGS " -mprefer-vector-width=512")
+endif()
+
+option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON)
+if (OPT_AGGRESSIVE_FP)  # defines DISALLOW_DENORMALS for every compiler; adds -ffast-math only for GCC/Clang
    add_definitions(-DDISALLOW_DENORMALS)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+        string(APPEND CMAKE_CXX_FLAGS " -ffast-math")
+    endif()
+endif()
+
+option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
+if (ENABLE_VECTORIZER_REMARKS)  # each branch appends the report flags of one compiler family
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
    endif()

+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
+    endif()
+
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI" OR CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC")  # compiler IDs carry no leading dash; NVHPC is the newer ID of the PGI compilers
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minfo")
+    endif()
+endif()
+
+######################################################################
+
 option(ENABLE_KOKKOS "Enable Kokkos" OFF)
 if (ENABLE_KOKKOS)
    FetchContent_Declare(kokkos
@@ -91,24 +124,6 @@ if (ENABLE_CUDA)
    endif()
 endif()

-option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
-if (ENABLE_VECTORIZER_REMARKS)
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
-    endif()
-
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
-    endif()
-
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
-    endif()
-
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
-    endif()
-endif()

 set(CMAKE_CXX_FLAGS_DEBUG "-g")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -g -DNDEBUG")

--- a/kokkos-testing/fd_catalog/fd_main.cpp
+++ b/kokkos-testing/fd_catalog/fd_main.cpp
@@ -52,9 +52,9 @@ int main(int argc, char **argv)
        wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
        ofs << sz << "    ";

-        //wec.init();
-        //time = solve_sequential(wec);
-        //ofs << time << "    ";
+        wec.init();
+        time = solve_sequential(wec);
+        ofs << time << "    ";

        wec.init();
        time = solve_sequential_blocked(wec);
@@ -68,9 +68,9 @@ int main(int argc, char **argv)

        for (size_t threads = 1; threads < maxthreads; threads *= 2)
        {
-            //wec.init();
-            //time = solve_multithread(wec, threads);
-            //ofs << time << "    ";
+            wec.init();
+            time = solve_multithread_nopool(wec, threads);
+            ofs << time << "    ";
        }

        ofs << std::endl;

--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -44,7 +44,7 @@ operator<<(std::ostream& os, const tile& tl)

 #define BLK_ROWS 56
 #define BLK_COLS 56
-//#define KERNEL_BLK_USE_MEMCPY
+#define KERNEL_BLK_USE_MEMCPY
 #define KERNEL_BLK_USE_TEMP

 template<typename T>
@@ -101,7 +101,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
 #else
-                #pragma ivdep
+                //#pragma ivdep
                #pragma clang loop vectorize(assume_safety)
                for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
                {
@@ -114,7 +114,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
 #endif /* KERNEL_BLK_USE_TEMP */
            for (size_t i = 0; i < BLK_ROWS; i++)
            {
-                #pragma ivdep
+                //#pragma ivdep
                //#pragma clang loop vectorize(assume_safety)
                for (size_t j = 0; j < BLK_COLS; j++)
                {
@@ -337,6 +337,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
        g_next(maxrow-1, j) = 0;
    }

+    #pragma loop distribute(enable)
    for (size_t i = 0; i < maxrow; i++)
    {
        g_next(i, 0) = 0;
@@ -349,7 +350,7 @@ void
 wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
               fd_grid<T>& g_next,
               const wave_2D_params<T>& params,
-               size_t thread_id = 0, size_t num_threads = 1)
+               size_t from = 0, size_t to = 0)
 {
    int maxrow  = params.maxrow;
    int maxcol  = params.maxcol;
@@ -357,6 +358,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
    T   c       = params.velocity;
    T   a       = params.damping;

+    if (to == 0)
+        to = maxrow;
+
    assert(maxcol > 1);
    assert(maxrow > 1);

@@ -373,8 +377,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
    T one_minus_adt = (1.0 - a*dt);
    T two_minus_adt = (2.0 - a*dt);

-    for (size_t i = thread_id; i < maxrow; i += num_threads)
+    for (size_t i = from; i < to; i++)
    {
+        #pragma clang loop vectorize(enable)
        for (size_t j = 0; j < maxcol; j++)
        {
            T lapl = 0.0;
@@ -614,3 +619,94 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
    return time/wec.maxiter;
 }

+/* Solve the wave equation spawning a fresh batch of `nths` threads at every
+ * time step (no thread pool): each thread updates one contiguous band of grid
+ * rows via wave_2D_kernel() and is joined before the grids are rotated.
+ * Returns the average time per iteration in milliseconds. */
+template<typename T>
+double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
+{
+    /* Simulation parameters */
+    wave_2D_params<T> params;
+    params.maxcol = wec.g_curr.domain_cols();
+    params.maxrow = wec.g_curr.domain_rows();
+    params.dt = wec.dt;
+    params.velocity = wec.velocity;
+    params.damping = wec.damping;
+
+    /* Per-thread work: rows are split in contiguous bands [from, to) */
+    auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
+#ifdef DISALLOW_DENORMALS
+        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#endif
+
+        size_t count        = params.maxrow / num_threads;
+        size_t remainder    = params.maxrow % num_threads;
+
+        size_t from = 0;
+        size_t to   = 0;
+
+        /* wave_2D_kernel() iterates over i in [from, to): `to` is exclusive.
+         * The first `remainder` threads take one extra row each. */
+        if (thread_id < remainder)
+        {
+            from    = thread_id * (count + 1);
+            to      = from + count + 1;
+        }
+        else
+        {
+            from = thread_id * count + remainder;
+            to   = from + count;
+        }
+
+        wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, from, to);
+    };
+
+    double time = 0.0;
+
+    for (size_t iter = 0; iter < wec.maxiter; iter++)
+    {
+        auto start = std::chrono::high_resolution_clock::now();
+
+        /* Thread spawn and join are both inside the timed region */
+        std::vector<std::thread> threads(nths);
+        for (size_t i = 0; i < nths; i++)
+            threads[i] = std::thread(thread_lambda, i, nths);
+
+        for (auto& th : threads)
+            th.join();
+
+        auto stop = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double, std::milli> ms = stop - start;
+        time += ms.count();
+
+        /* Rotate the grids: prev <- curr <- next (old prev is reused as next) */
+        std::swap(wec.g_prev, wec.g_curr);
+        std::swap(wec.g_curr, wec.g_next);
+
+#ifdef HAVE_SILO
+#ifdef SAVE_TIMESTEPS
+        if ( (iter%100) == 0 )
+        {
+            std::stringstream ss;
+            ss << "wave_mt_" << iter << ".silo";
+            visit_dump(wec.g_curr, ss.str());
+        }
+#endif /* SAVE_TIMESTEPS */
+#endif /* HAVE_SILO */
+    }
+
+    std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
+    std::cout << time/wec.maxiter << "ms" << std::endl;
+    std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
+    std::cout << time << "ms" << std::endl;
+
+#ifdef HAVE_SILO
+    visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
+#endif /* HAVE_SILO */
+
+    return time/wec.maxiter;
+}
+
+
--- a/kokkos-testing/test_thread_overhead.cpp
+++ b/kokkos-testing/test_thread_overhead.cpp
+#include <thread>
+
+/* Spawns 500000 threads running an empty callable and detaches each one right
+ * away; presumably used to gauge thread creation overhead (per the file name).
+ * The counter stays volatile, as in the original loop. */
+int main(int argc, char** argv)
+{
+  auto noop = [](){};
+  volatile int spawned = 0;
+  while (spawned < 500000)
+  {
+    std::thread(noop).detach();
+    spawned++;
+  }
+  return 0;
+}
+