Skip to content
Snippets Groups Projects
Commit d965a1d0 authored by Matteo Cicuttin's avatar Matteo Cicuttin
Browse files

Updated experimental data.

parent f2b143db
No related branches found
No related tags found
No related merge requests found
# Gnuplot script: per-iteration timing comparison for the 2D damped
# wave equation, double precision. Draws grouped histograms (one group
# per mesh size) from 'AllTimes_double_update.txt'; the header row of
# that file ("Seq", "SeqBlk", "1T", "2T", "4T") supplies the key
# labels via 'ti col'.
set term postscript enhanced color
set output 'AllTimes_double_update.eps'
set key left top
set title '2D damped wave equation - double'
set xlabel 'Mesh size'
set ylabel 'Iteration time [ms]'
# Histogram bars with solid fill and dark border; log-scaled y axis so
# the wide spread of timings (~0.07 ms to ~14 ms) stays readable.
set style data histogram
set style fill solid border -1
set logscale y
set grid ytics mytics
# Red reference segment spanning the four histogram clusters (x = 0..3);
# presumably marks an expected scaling trend — TODO confirm intent.
set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red'
# Column 1 = mesh size (used as x tick labels), columns 2-6 = the five
# timing series; '' reuses the previously named data file.
plot 'AllTimes_double_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \
'' u 4 ti col, '' u 5 ti col, '' u 6 ti col
"SIZE" "Seq" "SeqBlk" "1T" "2T" "4T"
128 0.215397 0.0721867 0.231328 0.146418 0.150125
256 0.884406 0.284522 0.996983 0.519418 0.420706
512 3.49626 1.34328 3.55165 1.84903 1.3564
1024 13.9678 5.97425 14.4531 7.56775 5.25945
# Gnuplot script: per-iteration timing comparison for the 2D damped
# wave equation, single precision. Same layout as the double-precision
# plot: grouped histograms per mesh size, series labels taken from the
# header row of 'AllTimes_float_update.txt' via 'ti col'.
set term postscript enhanced color
set output 'AllTimes_float_update.eps'
set key left top
set title '2D damped wave equation - float'
set xlabel 'Mesh size'
set ylabel 'Iteration time [ms]'
# Histogram bars with solid fill and dark border; log-scaled y axis so
# timings spanning roughly three decades stay readable.
set style data histogram
set style fill solid border -1
set logscale y
set grid ytics mytics
# Red reference segment spanning the four histogram clusters (x = 0..3);
# presumably marks an expected scaling trend — TODO confirm intent.
set arrow from 0,0.1 to 3,6.4 nohead lc rgb 'red'
# Column 1 = mesh size (used as x tick labels), columns 2-6 = the five
# timing series; '' reuses the previously named data file.
plot 'AllTimes_float_update.txt' using 2:xtic(1) ti col, '' u 3 ti col, \
'' u 4 ti col, '' u 5 ti col, '' u 6 ti col
"SIZE" "Seq" "SeqBlk" "1T" "2T" "4T"
128 0.285575 0.0328261 0.266828 0.159186 0.1409
256 1.15701 0.13435 1.07333 0.531615 0.387834
512 4.65854 0.57531 4.21764 2.03206 1.42824
1024 13.9774 3.15193 16.5969 8.72612 5.83051
"Size" "GT650M" "GTX760" "i7-3615QM"
128 13.062 45.9258 13.1641
256 16.8322 62.7176 13.3595
512 17.469 67.3603 11.3188
1024 17.0132 68.6386 10.1799
# Gnuplot script: measured GFlops/s of the blocked sequential kernel
# ("SeqBlk", double precision) on an i7-3615QM as a function of mesh
# size, plotted as a line-with-points series.
set term postscript enhanced color font ',20'
set output 'i7-3615QM_gflops_double_update.eps'
set title "i7-3615QM SeqBlk double"
set logscale x
set yrange [3:20]
set xlabel "Mesh size"
set ylabel "GFlops/s"
# Two dotted horizontal reference lines with labels placed just above
# them: the vendor-declared peak (18.4 GFlops/s) and a DGEMM baseline
# (3.83 GFlops/s).
set label "Intel declared: 18.4 GFlops/s" at 150,18.8
set label "DGEMM: 3.83 GFlops/s" at 150,4.2
set arrow 1 from 128,18.4 to 1024,18.4 nohead dt '.'
set arrow 2 from 128,3.83 to 1024,3.83 nohead dt '.'
# Column 1 = mesh size (also the x tick label), column 4 = GFlops/s.
# NOTE(review): the data file is named 'CUDA_gflops_double_update.txt'
# but the plotted series is the CPU SeqBlk kernel — verify this is the
# intended source file.
plot 'CUDA_gflops_double_update.txt' using 1:4:xtic(1) w lp lw 2 ti 'SeqBlk GFlops/s'
......@@ -21,18 +21,22 @@ endif()
find_package(OpenACC)
if (OpenACC_CXX_FOUND)
set(OPENACC_LINK_LIBS ${LINK_LIBS})
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(OPENACC_LINK_LIBS ${OPENACC_LINK_LIBS} -lgomp)
endif()
if (ENABLE_SINGLE)
add_executable(fd_openacc_single fd_openacc.cpp)
target_compile_definitions(fd_openacc_single PUBLIC -DSINGLE_PRECISION)
target_compile_options(fd_openacc_single PUBLIC ${OpenACC_CXX_FLAGS})
target_link_libraries(fd_openacc_single ${LINK_LIBS} -lgomp)
target_link_libraries(fd_openacc_single ${OPENACC_LINK_LIBS})
endif()
if (ENABLE_DOUBLE)
add_executable(fd_openacc_double fd_openacc.cpp)
target_compile_options(fd_openacc_double PUBLIC ${OpenACC_CXX_FLAGS})
target_link_libraries(fd_openacc_double ${LINK_LIBS} -lgomp)
target_link_libraries(fd_openacc_double ${OPENACC_LINK_LIBS})
endif()
endif()
......@@ -87,10 +91,22 @@ if (ENABLE_CUDA)
endif()
endif()
option(ENABLE_VECTORIZER_REMARKS "Enable Clang vectorizer remarks" ON)
option(ENABLE_VECTORIZER_REMARKS "Enable vectorizer remarks" ON)
if (ENABLE_VECTORIZER_REMARKS)
if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -qopt-report-phase=vec -qopt-report=2")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "-Minfo")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopt-info-vec-optimized")
endif()
endif()
......
......@@ -29,8 +29,9 @@ int main(int argc, char **argv)
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
std::cout << "Denormals: FTZ and DAZ" << std::endl;
#endif
_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_INVALID);
//_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~_MM_MASK_DENORM);
/* Make header */
ofs << "\"SIZE\" \"Seq\" \"SeqBlk\" ";
......@@ -51,9 +52,9 @@ int main(int argc, char **argv)
wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
ofs << sz << " ";
wec.init();
time = solve_sequential(wec);
ofs << time << " ";
//wec.init();
//time = solve_sequential(wec);
//ofs << time << " ";
wec.init();
time = solve_sequential_blocked(wec);
......@@ -67,9 +68,9 @@ int main(int argc, char **argv)
for (size_t threads = 1; threads < maxthreads; threads *= 2)
{
wec.init();
time = solve_multithread(wec, threads);
ofs << time << " ";
//wec.init();
//time = solve_multithread(wec, threads);
//ofs << time << " ";
}
ofs << std::endl;
......
......@@ -44,6 +44,8 @@ operator<<(std::ostream& os, const tile& tl)
#define BLK_ROWS 56
#define BLK_COLS 56
//#define KERNEL_BLK_USE_MEMCPY
#define KERNEL_BLK_USE_TEMP
template<typename T>
void
......@@ -51,9 +53,6 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
fd_grid<T>& g_next,
const wave_2D_params<T>& params)
{
/* 1) memcpy is faster than for
* 2) if you put inner block processing, you kill perf
*/
int maxrow = params.maxrow;
int maxcol = params.maxcol;
T dt = params.dt;
......@@ -71,10 +70,12 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
#ifdef KERNEL_BLK_USE_TEMP
T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE];
#endif
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1));
T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1));
T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt);
......@@ -91,19 +92,34 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
{
/* Here all the dimensions are known at compile time, let the
* compiler do its job. */
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma ivdep
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif /* KERNEL_BLK_USE_MEMCPY */
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < BLK_ROWS; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS; j++)
{
T lapl = 0.0;
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
......@@ -111,8 +127,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -124,22 +151,37 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
/* Do upper blocks */
{
size_t bi = blocks_y*BLK_ROWS;
for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
{
for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < rem_y; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < BLK_COLS; j++)
{
T lapl = 0.0;
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
......@@ -147,8 +189,19 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -162,29 +215,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
size_t bi = blocks_y*BLK_ROWS;
size_t bj = blocks_x*BLK_COLS;
for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < rem_y+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < rem_y; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -197,29 +275,54 @@ wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
{
size_t bj = blocks_x*BLK_COLS;
#ifdef KERNEL_BLK_USE_TEMP
for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
{
#ifdef KERNEL_BLK_USE_MEMCPY
int ofs_x = bj - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T));
memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+2*WAVE_8_HALO_SIZE)*sizeof(T));
#else
#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x+2*WAVE_8_HALO_SIZE; j++)
{
int ofs_x = bj + j - WAVE_8_HALO_SIZE;
int ofs_y = bi + i - WAVE_8_HALO_SIZE;
s_curr[i][j] = g_curr(ofs_y, ofs_x);
}
#endif
}
#endif /* KERNEL_BLK_USE_TEMP */
for (size_t i = 0; i < BLK_ROWS; i++)
{
#pragma ivdep
//#pragma clang loop vectorize(assume_safety)
for (size_t j = 0; j < rem_x; j++)
{
T lapl = 0.0;
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
#ifdef KERNEL_BLK_USE_TEMP
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
T val = lapl -
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
#else
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i, bj+j+k);
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(bi+i+k, bj+j);
T val = lapl -
one_minus_adt * g_prev(bi+i, bj+j) +
two_minus_adt * g_curr(bi+i, bj+j);
#endif /* KERNEL_BLK_USE_TEMP */
assert(j < BLK_COLS);
g_next(bi+i, bj+j) = val;
......@@ -265,8 +368,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
static const T w4 = -1.0/560.0;
static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
T kx2 = (c*dt*(maxcol-1)) * (c*dt*(maxcol-1));
T ky2 = (c*dt*(maxrow-1)) * (c*dt*(maxrow-1));
T one_minus_adt = (1.0 - a*dt);
T two_minus_adt = (2.0 - a*dt);
......@@ -281,7 +384,8 @@ wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j);
T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j);
T val = lapl - one_minus_adt * g_prev(i, j);
val += two_minus_adt * g_curr(i, j);
if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
val = 0;
......
......@@ -3,6 +3,9 @@
#include <vector>
#include <chrono>
#include <pmmintrin.h>
#include <xmmintrin.h>
#define YMIN (-1.5)
#define YMAX (1.5)
#define XMIN (-2.5)
......@@ -56,8 +59,6 @@ void gpu_mandel(T *mandel, int maxiter)
mandel[i*SIZE_X + j] = iter;
}
#endif /* __NVCC__ */
template<typename T>
void compute_gpu(std::vector<T>& mandel, int maxiter)
{
......@@ -65,10 +66,12 @@ void compute_gpu(std::vector<T>& mandel, int maxiter)
dim3 threads_per_block(WAVE_8_KER_COLS, WAVE_8_KER_ROWS);
}
#endif /* __NVCC__ */
int main(void)
{
size_t maxiter = 1000;
using T = double;
using T = float;
std::vector<T> mandel(SIZE_X*SIZE_Y);
......@@ -76,7 +79,9 @@ int main(void)
size_t flops = 0;
#pragma omp parallel for
//_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
//_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
for (size_t i = 0; i < SIZE_Y; i++)
{
for (size_t j = 0; j < SIZE_X; j++)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment