fd_wave_cpu.hpp

/* Performance evaluation of different implementations of finite difference
 * schemes.
 *
 * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
 */

#include <iostream>
#include <thread>
#include <condition_variable>
#include <cassert>
#include <list>
#include <cmath>
#include <algorithm>
#include <sstream>

#include <string.h>
#include <pmmintrin.h>
#include <xmmintrin.h>

#include "fd_grid.hpp"
#include "fd_dataio.hpp"
#include "fd_wave.hpp"

struct tile
{
    int dim_x;
    int dim_y;
    int idx_x;
    int idx_y;

    tile() {}

    tile(int dx, int dy, int ix, int iy)
        : dim_x(dx), dim_y(dy), idx_x(ix), idx_y(iy)
    {}
};

std::ostream&
operator<<(std::ostream& os, const tile& tl)
{
    os << "tile (" << tl.idx_x << ", " << tl.idx_y << ") " << tl.dim_x << ", " << tl.dim_y;
    return os;
}

#define BLK_ROWS 56
#define BLK_COLS 56

template<typename T>
void
wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
               fd_grid<T>& g_next,
               const wave_2D_params<T>& params)
{
    /* 1) memcpy is faster than for
     * 2) if you put inner block processing, you kill perf
     */
    int maxrow  = params.maxrow;
    int maxcol  = params.maxcol;
    T   dt      = params.dt;
    T   c       = params.velocity;
    T   a       = params.damping;

    assert(maxcol > 1);
    assert(maxrow > 1);

    /**** Initialize constants ****/
    static const T w0 = -205.0/72.0;
    static const T w1 = 8.0/5.0;
    static const T w2 = -1.0/5.0;
    static const T w3 = 8.0/315.0;
    static const T w4 = -1.0/560.0;
    static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };

    T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE];

    T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
    T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
    T one_minus_adt = (1.0 - a*dt);
    T two_minus_adt = (2.0 - a*dt);

    size_t blocks_x = maxcol / BLK_COLS;
    size_t blocks_y = maxrow / BLK_ROWS;
    size_t rem_x = maxcol % BLK_COLS;
    size_t rem_y = maxrow % BLK_ROWS;


    /* Do fixed-size blocks */
    for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
    {
        for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
        {
            /* Here all the dimensions are known at compile time, let the
             * compiler do its job. */
            for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
            {
                int ofs_x = bj - WAVE_8_HALO_SIZE;
                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
            }

            for (size_t i = 0; i < BLK_ROWS; i++)
            {
                for (size_t j = 0; j < BLK_COLS; j++)
                {
                    T lapl = 0.0;
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
            
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];

                    T val = lapl -
                            one_minus_adt * g_prev(bi+i, bj+j) +
                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                    
                    assert(j < BLK_COLS);
                    g_next(bi+i, bj+j) = val;
                }
            }
        }
    }

    /* Do upper blocks */
    {
        size_t bi = blocks_y*BLK_ROWS;
        for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
        {
            for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
            {
                
                int ofs_x = bj - WAVE_8_HALO_SIZE;
                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
            }

            for (size_t i = 0; i < rem_y; i++)
            {
                for (size_t j = 0; j < BLK_COLS; j++)
                {
                    T lapl = 0.0;
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
            
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];

                    T val = lapl -
                            one_minus_adt * g_prev(bi+i, bj+j) +
                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                    
                    assert(j < BLK_COLS);
                    g_next(bi+i, bj+j) = val;
                }
            }
        }
    }

    /* Do upper leftmost corner */
    {
        size_t bi = blocks_y*BLK_ROWS;
        size_t bj = blocks_x*BLK_COLS;

        for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
        {
            
            int ofs_x = bj - WAVE_8_HALO_SIZE;
            int ofs_y = bi + i - WAVE_8_HALO_SIZE;
            assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
        }

        for (size_t i = 0; i < rem_y; i++)
        {
            for (size_t j = 0; j < rem_x; j++)
            {
                T lapl = 0.0;
                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
        
                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];

                T val = lapl -
                        one_minus_adt * g_prev(bi+i, bj+j) +
                        two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                
                assert(j < BLK_COLS);
                g_next(bi+i, bj+j) = val;
            }
        }
    }

    /* Do leftmost blocks */
    for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
    {
        size_t bj = blocks_x*BLK_COLS;

        for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
        {
            
            int ofs_x = bj - WAVE_8_HALO_SIZE;
            int ofs_y = bi + i - WAVE_8_HALO_SIZE;
            assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
            memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T));
        }

        for (size_t i = 0; i < BLK_ROWS; i++)
        {
            for (size_t j = 0; j < rem_x; j++)
            {
                T lapl = 0.0;
                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
        
                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];

                T val = lapl -
                        one_minus_adt * g_prev(bi+i, bj+j) +
                        two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                
                assert(j < BLK_COLS);
                g_next(bi+i, bj+j) = val;
            }
        }
    }

    /* Apply BCs */
    for (size_t j = 0; j < maxcol; j++)
    {
        g_next(0,j) = 0;
        g_next(maxrow-1, j) = 0;
    }

    for (size_t i = 0; i < maxrow; i++)
    {
        g_next(i, 0) = 0;
        g_next(i, maxcol-1) = 0;
    }
}

template<typename T>
void
wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
               fd_grid<T>& g_next,
               const wave_2D_params<T>& params,
               size_t thread_id = 0, size_t num_threads = 1)
{
    int maxrow  = params.maxrow;
    int maxcol  = params.maxcol;
    T   dt      = params.dt;
    T   c       = params.velocity;
    T   a       = params.damping;

    assert(maxcol > 1);
    assert(maxrow > 1);

    /**** Initialize constants ****/
    static const T w0 = -205.0/72.0;
    static const T w1 = 8.0/5.0;
    static const T w2 = -1.0/5.0;
    static const T w3 = 8.0/315.0;
    static const T w4 = -1.0/560.0;
    static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };

    T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
    T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
    T one_minus_adt = (1.0 - a*dt);
    T two_minus_adt = (2.0 - a*dt);

    for (size_t i = thread_id; i < maxrow; i += num_threads)
    {
        for (size_t j = 0; j < maxcol; j++)
        {
            T lapl = 0.0;
            for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i,j+k);
            for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j);

            T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j);

            if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
                val = 0;
            
            g_next(i, j) = val;
        }
    }
}

template<typename T, bool blocked>
double
solve_sequential_aux(wave_equation_context<T>& wec)
{
    /* Simulation parameters */
    wave_2D_params<T> params;
    params.maxcol = wec.g_curr.domain_cols();
    params.maxrow = wec.g_curr.domain_rows();
    params.dt = wec.dt;
    params.velocity = wec.velocity;
    params.damping = wec.damping;

    tile tl;
    tl.dim_x = params.maxcol;
    tl.dim_y = params.maxrow;
    tl.idx_x = 0;
    tl.idx_y = 0;

    double time = 0.0;

#ifdef SAVE_ITERTIME
    std::ofstream ofs;   
    if (blocked)
        ofs.open("itertime-blocked.txt");
    else
        ofs.open("itertime-naive.txt");
#endif /* SAVE_ITERTIME */

    for (size_t i = 0; i < wec.maxiter; i++)
    {
        auto start = std::chrono::high_resolution_clock::now();

        if (blocked)
            wave_2D_kernel_blk(wec.g_prev, wec.g_curr, wec.g_next, params);
        else
            wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params);

        auto stop = std::chrono::high_resolution_clock::now();

        std::chrono::duration<double, std::milli> ms = stop - start;
        time += ms.count();
        
        std::swap(wec.g_prev, wec.g_curr);
        std::swap(wec.g_curr, wec.g_next);

#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
        if ( (i%100) == 0 )
        {
            std::stringstream ss;
            if (blocked)
                ss << "wave_seqblk_" << i << ".silo";
            else
                ss << "wave_seq_" << i << ".silo";
            visit_dump(wec.g_curr, ss.str());
        }
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */

#ifdef SAVE_ITERTIME
        ofs << i << " " << ms.count() << std::endl;
#endif /* SAVE_ITERTIME */
    }
    
    if (blocked)
    {
        std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
        std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl;

        double itertime = time/wec.maxiter;
        double gflops_s = 58*(params.maxrow*params.maxcol)/(1e6*itertime);
        std::cout << "[Wave][SeqBlk] GFlops/s: " << gflops_s << std::endl;
    }
    else
    {
        std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
        std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl;
        
        double itertime = time/wec.maxiter;
        double gflops_s = 58*(params.maxrow*params.maxcol)/(1e6*itertime);
        std::cout << "[Wave][Sequential] GFlops/s: " << gflops_s << std::endl;

        size_t kernel_bytes = 3*sizeof(T)*(params.maxrow*params.maxcol);
        double gbytes_s = kernel_bytes/(1e6*itertime);
        std::cout << "[Wave][Sequential] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
    }

#ifdef HAVE_SILO
    if (blocked)
        visit_dump(wec.g_curr, "wave_seqblk_lastiter.silo");
    else
        visit_dump(wec.g_curr, "wave_seq_lastiter.silo");
#endif /* HAVE_SILO */

    return time/wec.maxiter;
}

template<typename T>
double solve_sequential(wave_equation_context<T>& wec)
{
    return solve_sequential_aux<T,false>(wec);
}

template<typename T>
double solve_sequential_blocked(wave_equation_context<T>& wec)
{
    return solve_sequential_aux<T,true>(wec);
}

template<typename T>
double solve_multithread(wave_equation_context<T>& wec, size_t nths)
{
    /* Simulation parameters */
    wave_2D_params<T> params;
    params.maxcol = wec.g_curr.domain_cols();
    params.maxrow = wec.g_curr.domain_rows();
    params.dt = wec.dt;
    params.velocity = wec.velocity;
    params.damping = wec.damping;


    /* Multithreading stuff */
    std::mutex                  cv_mtx;
    std::condition_variable     prod_cv;
    std::condition_variable     cons_cv;
    std::vector<bool>           thread_done(nths);
    bool                        iteration_finished = false;

    auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
#ifdef DISALLOW_DENORMALS
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
        while (1)
        {
            /* Wait for the producer to notify that there's something to do */
            {
                std::unique_lock<std::mutex> lck(cv_mtx);
                while ( thread_done[thread_id] )
                    cons_cv.wait(lck);

                if (iteration_finished)
                    return;
            }

            /* Do the timestep */
            wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads);

            /* Work for this thread finished, notify producer */
            std::unique_lock<std::mutex> lck(cv_mtx);
            prod_cv.notify_one();
            thread_done[thread_id] = true;
        }
    };


    std::vector<std::thread> threads(nths);
    for (size_t i = 0; i < nths; i++)
        threads[i] = std::thread(thread_lambda, i, nths);

    double time = 0.0;

    for (size_t iter = 0; iter < wec.maxiter; iter++)
    {
        auto start = std::chrono::high_resolution_clock::now();

        std::unique_lock<std::mutex> lck(cv_mtx);
        /* Mark data ready and start the threads */
        for (size_t i = 0; i < nths; i++)
            thread_done[i] = false;
        cons_cv.notify_all();

        while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) )
            prod_cv.wait(lck);

        auto stop = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> ms = stop - start;
        time += ms.count();

        std::swap(wec.g_prev, wec.g_curr);
        std::swap(wec.g_curr, wec.g_next);

#ifdef HAVE_SILO
#ifdef SAVE_TIMESTEPS
        if ( (iter%100) == 0 )
        {
            std::stringstream ss;
            ss << "wave_mt_" << iter << ".silo";
            visit_dump(wec.g_curr, ss.str());
        }
#endif /* SAVE_TIMESTEPS */
#endif /* HAVE_SILO */
    }

    /* Tell all the threads to stop */
    {
        std::unique_lock<std::mutex> lck(cv_mtx);
        for (size_t i = 0; i < nths; i++)
            thread_done[i] = false;
        iteration_finished = true;
        cons_cv.notify_all();
    }

    /* Wait for all the threads to finish */
    for (auto& th : threads)
        th.join();
    
    std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
    std::cout << time/wec.maxiter << "ms" << std::endl;
    std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
    std::cout << time << "ms" << std::endl;

#ifdef HAVE_SILO
    visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
#endif /* HAVE_SILO */

    return time/wec.maxiter;
}