Skip to content
Snippets Groups Projects
Commit f1eb4ee4 authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

Modified MT implementation.

parent d965a1d0
No related branches found
No related tags found
No related merge requests found
......@@ -53,11 +53,44 @@ if (ENABLE_ITERTIME_OUTPUT)
add_definitions(-DSAVE_ITERTIME)
endif()
option(ENABLE_DAZ_FTZ "Enable Denormals Are Zero and Flush To Zero flags" ON)
if (ENABLE_DAZ_FTZ)
######################################################################
## Optimization stuff
option(OPT_PREFER_512bit "Prefer 512 bit vectors with AVX512 (Clang > 10 & GCC)" OFF)
if (OPT_PREFER_512bit)
# https://reviews.llvm.org/D67259
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mprefer-vector-width=512")
endif()
option(OPT_AGGRESSIVE_FP "Enable DAZ, FTZ and -ffast-math" ON)
if (OPT_AGGRESSIVE_FP)
add_definitions(-DDISALLOW_DENORMALS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math")
endif()
endif()
option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
if (ENABLE_VECTORIZER_REMARKS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "-PGI")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Minfo")
endif()
endif()
######################################################################
option(ENABLE_KOKKOS "Enable Kokkos" OFF)
if (ENABLE_KOKKOS)
FetchContent_Declare(kokkos
......@@ -91,24 +124,6 @@ if (ENABLE_CUDA)
endif()
endif()
option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
if (ENABLE_VECTORIZER_REMARKS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
endif()
endif()
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -g -DNDEBUG")
......
......@@ -29,7 +29,7 @@ int main(int argc, char **argv)
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
std::cout << "Denormals: FTZ and DAZ" << std::endl;
#endif
_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
//_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_DENORM);
......@@ -52,9 +52,9 @@ int main(int argc, char **argv)
wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
ofs << sz << " ";
//wec.init();
//time = solve_sequential(wec);
//ofs << time << " ";
wec.init();
time = solve_sequential(wec);
ofs << time << " ";
wec.init();
time = solve_sequential_blocked(wec);
......@@ -68,9 +68,9 @@ int main(int argc, char **argv)
for (size_t threads = 1; threads < maxthreads; threads *= 2)
{
//wec.init();
//time = solve_multithread(wec, threads);
//ofs << time << " ";
wec.init();
time = solve_multithread_nopool(wec, threads);
ofs << time << " ";
}
ofs << std::endl;
......
......@@ -44,7 +44,7 @@ operator<<(std::ostream& os, const tile& tl)
#define BLK_ROWS 56
#define BLK_COLS 56
//#define KERNEL_BLK_USE_MEMCPY
#define KERNEL_BLK_USE_MEMCPY
#define KERNEL_BLK_USE_TEMP
template<typename T>
......@@ -101,7 +101,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma ivdep
//#pragma ivdep
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
{
......@@ -114,7 +114,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < BLK_ROWS; i++)
{
#pragma ivdep
//#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS; j++)
{
......@@ -337,6 +337,7 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
g_next(maxrow-1, j) = 0;
}
#pragma loop distribute(enable)
for (size_t i = 0; i < maxrow; i++)
{
g_next(i, 0) = 0;
......@@ -349,7 +350,7 @@ void
wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params,
size_t thread_id = 0, size_t num_threads = 1)
size_t from = 0, size_t to = 0)
{
int maxrow = params.maxrow;
int maxcol = params.maxcol;
......@@ -357,6 +358,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
T c = params.velocity;
T a = params.damping;
if (to == 0)
to = maxrow;
assert(maxcol > 1);
assert(maxrow > 1);
......@@ -373,8 +377,9 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt);
for (size_t i = thread_id; i < maxrow; i += num_threads)
for (size_t i = from; i < to; i++)
{
#pragma clang loop vectorize(enable)
for (size_t j = 0; j < maxcol; j++)
{
T lapl = 0.0;
......@@ -614,3 +619,94 @@ double solve_multithread(wave_equation_context<T>& wec, size_t nths)
return time/wec.maxiter;
}
template<typename T>
double solve_multithread_nopool(wave_equation_context<T>& wec, size_t nths)
{
/* Simulation parameters */
wave_2D_params<T> params;
params.maxcol = wec.g_curr.domain_cols();
params.maxrow = wec.g_curr.domain_rows();
params.dt = wec.dt;
params.velocity = wec.velocity;
params.damping = wec.damping;
/* Multithreading stuff */
std::mutex cv_mtx;
std::condition_variable prod_cv;
std::condition_variable cons_cv;
std::vector<bool> thread_done(nths);
bool iteration_finished = false;
auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
#ifdef DISALLOW_DENORMALS
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
size_t count = params.maxrow / num_threads;
size_t remainder = params.maxrow % num_threads;
size_t from = 0;
size_t to = 0;
if (thread_id < remainder)
{
from = thread_id * (count + 1);
to = from + count;
}
else
{
from = thread_id * count + remainder;
to = from + (count - 1);
}
wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, from, to);
};
double time = 0.0;
for (size_t iter = 0; iter < wec.maxiter; iter++)
{
auto start = std::chrono::high_resolution_clock::now();
std::vector<std::thread> threads(nths);
for (size_t i = 0; i < nths; i++)
threads[i] = std::thread(thread_lambda, i, nths);
for (auto& th : threads)
th.join();
auto stop = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> ms = stop - start;
time += ms.count();
std::swap(wec.g_prev, wec.g_curr);
std::swap(wec.g_curr, wec.g_next);
#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
if ( (iter%100) == 0 )
{
std::stringstream ss;
ss << "wave_mt_" << iter << ".silo";
visit_dump(wec.g_curr, ss.str());
}
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
}
std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
std::cout << time/wec.maxiter << "ms" << std::endl;
std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
std::cout << time << "ms" << std::endl;
#ifdef HAVE_SILO
visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
#endif /* HAVE_SILO */
return time/wec.maxiter;
}
#include <thread>
int main(int argc, char** argv)
{
for (volatile int i = 0; i < 500000; i++)
std::thread([](){}).detach();
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment