Testing thread synchronization overhead.

5e156636 · Matteo Cicuttin · f1eb4ee4 · 5e156636 · 5e156636 · 5e156636
Commit 5e156636 authored Mar 29, 2020 by Matteo Cicuttin
--- a/kokkos-testing/fd_catalog/fd_main.cpp
+++ b/kokkos-testing/fd_catalog/fd_main.cpp
@@ -65,11 +65,10 @@ int main(int argc, char **argv)
        time = solve_cuda(wec);
        ofs << time << "    ";
 #endif
-
        for (size_t threads = 1; threads < maxthreads; threads *= 2)
        {
            wec.init();
-            time = solve_multithread_nopool(wec, threads);
+            time = solve_multithread(wec, threads);
            ofs << time << "    ";
        }


--- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
+++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp
@@ -12,6 +12,7 @@
 #include <cmath>
 #include <algorithm>
 #include <sstream>
+#include <atomic>

 #include <string.h>
 #include <pmmintrin.h>
@@ -350,7 +351,7 @@ void
 wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
               fd_grid<T>& g_next,
               const wave_2D_params<T>& params,
-               size_t from = 0, size_t to = 0)
+               size_t from = 0, size_t to = 1)
 {
    int maxrow  = params.maxrow;
    int maxcol  = params.maxcol;
@@ -358,12 +359,12 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
    T   c       = params.velocity;
    T   a       = params.damping;

-    if (to == 0)
-        to = maxrow;
-
    assert(maxcol > 1);
    assert(maxrow > 1);

+    //if (to == 0)
+    //    to = maxrow-1;
+
    /**** Initialize constants ****/
    static const T w0 = -205.0/72.0;
    static const T w1 = 8.0/5.0;
@@ -377,7 +378,7 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
    T one_minus_adt = (1.0 - a*dt);
    T two_minus_adt = (2.0 - a*dt);

-    for (size_t i = from; i < to; i++)
+    for (size_t i = from; i < maxrow; i+=to)
    {
        #pragma clang loop vectorize(enable)
        for (size_t j = 0; j < maxcol; j++)
@@ -509,6 +510,18 @@ double solve_sequential_blocked(wave_equation_context<T>& wec)
    return solve_sequential_aux<T,true>(wec);
 }

+class SpinLock { // Minimal busy-waiting lock built on std::atomic_flag; exposes lock() and unlock().
+    std::atomic_flag locked = ATOMIC_FLAG_INIT ; // starts in the clear state, i.e. the lock is not held
+public:
+    void lock() { // Blocks the caller by spinning until it acquires the lock.
+        while (locked.test_and_set(std::memory_order_acquire)) { ; } // test_and_set returns the previous value: true means the flag was already set, so retry
+    }
+    void unlock() { // Releases the lock by clearing the flag.
+        locked.clear(std::memory_order_release); // release ordering pairs with the acquire in lock()
+    }
+};
+
+
 template<typename T>
 double solve_multithread(wave_equation_context<T>& wec, size_t nths)
 {
@@ -526,6 +539,7 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
    std::condition_variable     prod_cv;
    std::condition_variable     cons_cv;
    std::vector<bool>           thread_done(nths);
+    std::vector<double>         times(nths);
    bool                        iteration_finished = false;

    auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
@@ -546,12 +560,16 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
            }

            /* Do the timestep */
+            auto start = std::chrono::high_resolution_clock::now();
            wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads);
+            auto stop = std::chrono::high_resolution_clock::now();
+            std::chrono::duration<double, std::milli> ms = stop - start;

            /* Work for this thread finished, notify producer */
            std::unique_lock<std::mutex> lck(cv_mtx);
            prod_cv.notify_one();
            thread_done[thread_id] = true;
+            times[thread_id] += ms.count();
        }
    };

@@ -607,11 +625,23 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
    for (auto& th : threads)
        th.join();
    
+    double itertime = time/wec.maxiter;
+
    std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
-    std::cout << time/wec.maxiter << "ms" << std::endl;
+    std::cout << itertime << "ms" << std::endl;
    std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
    std::cout << time << "ms" << std::endl;

+    for (auto& t : times)
+    {
+        double t_itertime = t/wec.maxiter;
+        double t_overhead = (time - t)/wec.maxiter;
+        double t_overhead_percent = 100.0*t_overhead/itertime;
+        std::cout << "  Thread time: " << t_itertime;
+        std::cout << " overhead: " << t_overhead << " (";
+        std::cout << t_overhead_percent << "%)" << std::endl;
+    }
+
 #ifdef HAVE_SILO
    visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
 #endif /* HAVE_SILO */
@@ -639,6 +669,7 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
    bool                        iteration_finished = false;

    auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
+        /*
 #ifdef DISALLOW_DENORMALS
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
@@ -662,6 +693,7 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
        }

        wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, from, to);
+        */
    };


@@ -675,8 +707,8 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
        for (size_t i = 0; i < nths; i++)
            threads[i] = std::thread(thread_lambda, i, nths);

-        for (auto& th : threads)
-            th.join();
+        for (size_t i = 0; i < nths; i++)
+            threads[i].join();

        auto stop = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> ms = stop - start;

--- a/kokkos-testing/test_sumbw.cpp
+++ b/kokkos-testing/test_sumbw.cpp
@@ -54,8 +54,8 @@ int main(void)
        for (size_t iter = 0; iter < maxiter; iter++)
        {
            auto start = std::chrono::high_resolution_clock::now();
-            sum_restrict(prev.data(), curr.data(), next.data(), sz*sz);
-            //memcpy(next.data(), curr.data(), sz*sz*sizeof(T));
+            //sum_restrict(prev.data(), curr.data(), next.data(), sz*sz);
+            memcpy(next.data(), curr.data(), sz*sz*sizeof(T));
            std::swap(prev, curr);
            std::swap(curr, next);
            auto stop = std::chrono::high_resolution_clock::now();
@@ -65,7 +65,7 @@ int main(void)
        }

        auto time = std::accumulate(itertime.begin(), itertime.end(), 0.0) / maxiter;
-        std::cout << "Sum bandwidth: " <<  3*sizeof(T)*sz*sz/(1e6*time);
+        std::cout << "Sum bandwidth: " <<  2*sizeof(T)*sz*sz/(1e6*time);
        std::cout << " GB/s"  << std::endl;

        itertimes.push_back( std::move(itertime) );