diff --git a/kokkos-testing/fd_catalog/fd_main.cpp b/kokkos-testing/fd_catalog/fd_main.cpp index cbf749ab2255c19b5774a6afb6e9c01a8cad5db9..0f1a8c1a6f38952dcacb28be665f6e303553e5d2 100644 --- a/kokkos-testing/fd_catalog/fd_main.cpp +++ b/kokkos-testing/fd_catalog/fd_main.cpp @@ -65,11 +65,10 @@ int main(int argc, char **argv) time = solve_cuda(wec); ofs << time << " "; #endif - for (size_t threads = 1; threads < maxthreads; threads *= 2) { wec.init(); - time = solve_multithread_nopool(wec, threads); + time = solve_multithread(wec, threads); ofs << time << " "; } diff --git a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp index 31b620ff97ab7f96901832de15440395e1ec5f2a..e82cebff0ea61d70072b74f672a99159c4325636 100644 --- a/kokkos-testing/fd_catalog/fd_wave_cpu.hpp +++ b/kokkos-testing/fd_catalog/fd_wave_cpu.hpp @@ -12,6 +12,7 @@ #include <cmath> #include <algorithm> #include <sstream> +#include <atomic> #include <string.h> #include <pmmintrin.h> @@ -350,7 +351,7 @@ void wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, fd_grid<T>& g_next, const wave_2D_params<T>& params, - size_t from = 0, size_t to = 0) + size_t from = 0, size_t to = 1) { int maxrow = params.maxrow; int maxcol = params.maxcol; @@ -358,12 +359,12 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, T c = params.velocity; T a = params.damping; - if (to == 0) - to = maxrow; - assert(maxcol > 1); assert(maxrow > 1); + //if (to == 0) + // to = maxrow-1; + /**** Initialize constants ****/ static const T w0 = -205.0/72.0; static const T w1 = 8.0/5.0; @@ -377,7 +378,7 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr, T one_minus_adt = (1.0 - a*dt); T two_minus_adt = (2.0 - a*dt); - for (size_t i = from; i < to; i++) + for (size_t i = from; i < maxrow; i+=to) { #pragma clang loop vectorize(enable) for (size_t j = 0; j < maxcol; j++) @@ -509,6 +510,18 @@ double solve_sequential_blocked(wave_equation_context<T>& wec) return solve_sequential_aux<T,true>(wec); } +class SpinLock { + std::atomic_flag locked = ATOMIC_FLAG_INIT ; +public: + void lock() { + while (locked.test_and_set(std::memory_order_acquire)) { ; } + } + void unlock() { + locked.clear(std::memory_order_release); + } +}; + + template<typename T> double solve_multithread(wave_equation_context<T>& wec, size_t nths) { @@ -526,6 +539,7 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths) std::condition_variable prod_cv; std::condition_variable cons_cv; std::vector<bool> thread_done(nths); + std::vector<double> times(nths); bool iteration_finished = false; auto thread_lambda = [&](size_t thread_id, size_t num_threads) { @@ -546,12 +560,16 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths) } /* Do the timestep */ + auto start = std::chrono::high_resolution_clock::now(); wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads); + auto stop = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double, std::milli> ms = stop - start; /* Work for this thread finished, notify producer */ std::unique_lock<std::mutex> lck(cv_mtx); prod_cv.notify_one(); thread_done[thread_id] = true; + times[thread_id] += ms.count(); } }; @@ -607,11 +625,23 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths) for (auto& th : threads) th.join(); + double itertime = time/wec.maxiter; + std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): "; - std::cout << time/wec.maxiter << "ms" << std::endl; + std::cout << itertime << "ms" << std::endl; std::cout << "[Wave][MT] Wall Time (" << nths << " threads): "; std::cout << time << "ms" << std::endl; + for (auto& t : times) + { + double t_itertime = t/wec.maxiter; + double t_overhead = (time - t)/wec.maxiter; + double t_overhead_percent = 100.0*t_overhead/itertime; + std::cout << " Thread time: " << t_itertime; + std::cout << " overhead: " << t_overhead << " ("; + std::cout << t_overhead_percent << "%)" << std::endl; + } + #ifdef HAVE_SILO visit_dump(wec.g_curr, "wave_mt_lastiter.silo"); #endif /* HAVE_SILO */ @@ -639,6 +669,7 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths) bool iteration_finished = false; auto thread_lambda = [&](size_t thread_id, size_t num_threads) { + /* #ifdef DISALLOW_DENORMALS _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); @@ -662,6 +693,7 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths) } wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, from, to); + */ }; @@ -675,8 +707,8 @@ double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths) for (size_t i = 0; i < nths; i++) threads[i] = std::thread(thread_lambda, i, nths); - for (auto& th : threads) - th.join(); + for (size_t i = 0; i < nths; i++) + threads[i].join(); auto stop = std::chrono::high_resolution_clock::now(); std::chrono::duration<double, std::milli> ms = stop - start; diff --git a/kokkos-testing/test_sumbw.cpp b/kokkos-testing/test_sumbw.cpp index 4ee808e32961ed9631dc5fec2d1ae1c969106406..d1271a71a473c2171b16252a575cf28dd4e8b293 100644 --- a/kokkos-testing/test_sumbw.cpp +++ b/kokkos-testing/test_sumbw.cpp @@ -54,8 +54,8 @@ int main(void) for (size_t iter = 0; iter < maxiter; iter++) { auto start = std::chrono::high_resolution_clock::now(); - sum_restrict(prev.data(), curr.data(), next.data(), sz*sz); - //memcpy(next.data(), curr.data(), sz*sz*sizeof(T)); + //sum_restrict(prev.data(), curr.data(), next.data(), sz*sz); + memcpy(next.data(), curr.data(), sz*sz*sizeof(T)); std::swap(prev, curr); std::swap(curr, next); auto stop = std::chrono::high_resolution_clock::now(); @@ -65,7 +65,7 @@ int main(void) } auto time = std::accumulate(itertime.begin(), itertime.end(), 0.0) / maxiter; - std::cout << "Sum bandwidth: " << 3*sizeof(T)*sz*sz/(1e6*time); + std::cout << "Sum bandwidth: " << 2*sizeof(T)*sz*sz/(1e6*time); std::cout << " GB/s" << std::endl; itertimes.push_back( std::move(itertime) );