Skip to content
Snippets Groups Projects
Select Git revision
  • a3221785cab590b387209068b407cbd5e61faf80
  • master default protected
  • func-supercond
  • dof-renumbering
  • test-dof-hash
  • gdemesy-master-patch-30528
  • eval-space-time
  • oscillating_multiharm
  • MH_movement
  • axisqu
  • write_vtu_and_ensight_formats
  • movingband
  • CP_1972_add_vtu_file_writing
  • mortar
  • fast_freq_sweep_Resolution
  • applyresolvent_again
  • marteaua-master-patch-54323
  • patch-1
  • binde-master-patch-08072
  • binde-master-patch-52461
  • BCGSL
  • getdp_3_5_0
  • getdp_3_4_0
  • getdp_3_3_0
  • getdp_3_2_0
  • getdp_3_1_0
  • getdp_3_0_4
  • getdp_3_0_3
  • getdp_3_0_2
  • getdp_3_0_1
  • getdp_3_0_0
  • onelab_mobile_2.1.0
  • getdp_2_11_3 protected
  • getdp_2_11_2 protected
  • getdp_2_11_1 protected
  • getdp_2_11_0 protected
  • getdp_2_10_0 protected
  • getdp_2_9_2 protected
  • getdp_2_9_1 protected
  • getdp_2_9_0 protected
  • getdp_2_8_0 protected
41 results

Cal_SmallFemTermOfFemEquation.h

Blame
  • fd_wave_cpu.hpp 15.73 KiB
    /* Performance evaluation of different implementations of finite difference
     * schemes.
     *
     * Matteo Cicuttin (C) 2020, Université de Liège, ACE Group
     */
    
    #include <iostream>
    #include <thread>
    #include <condition_variable>
    #include <cassert>
    #include <list>
    #include <cmath>
    #include <algorithm>
    #include <sstream>
    
    #include <string.h>
    #include <pmmintrin.h>
    #include <xmmintrin.h>
    
    #include "fd_grid.hpp"
    #include "fd_dataio.hpp"
    #include "fd_wave.hpp"
    
    struct tile
    {
        int dim_x;
        int dim_y;
        int idx_x;
        int idx_y;
    
        tile() {}
    
        tile(int dx, int dy, int ix, int iy)
            : dim_x(dx), dim_y(dy), idx_x(ix), idx_y(iy)
        {}
    };
    
    std::ostream&
    operator<<(std::ostream& os, const tile& tl)
    {
        os << "tile (" << tl.idx_x << ", " << tl.idx_y << ") " << tl.dim_x << ", " << tl.dim_y;
        return os;
    }
    
    #define BLK_ROWS 56
    #define BLK_COLS 56
    
    template<typename T>
    void
    wave_2D_kernel_blk(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                   fd_grid<T>& g_next,
                   const wave_2D_params<T>& params)
    {
        /* 1) memcpy is faster than for
         * 2) if you put inner block processing, you kill perf
         */
        int maxrow  = params.maxrow;
        int maxcol  = params.maxcol;
        T   dt      = params.dt;
        T   c       = params.velocity;
        T   a       = params.damping;
    
        assert(maxcol > 1);
        assert(maxrow > 1);
    
        /**** Initialize constants ****/
        static const T w0 = -205.0/72.0;
        static const T w1 = 8.0/5.0;
        static const T w2 = -1.0/5.0;
        static const T w3 = 8.0/315.0;
        static const T w4 = -1.0/560.0;
        static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
    
        T s_curr[BLK_ROWS+2*WAVE_8_HALO_SIZE][BLK_COLS+2*WAVE_8_HALO_SIZE];
    
        T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
        T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
        T one_minus_adt = (1.0 - a*dt);
        T two_minus_adt = (2.0 - a*dt);
    
        size_t blocks_x = maxcol / BLK_COLS;
        size_t blocks_y = maxrow / BLK_ROWS;
        size_t rem_x = maxcol % BLK_COLS;
        size_t rem_y = maxrow % BLK_ROWS;
    
    
        /* Do fixed-size blocks */
        for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
        {
            for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
            {
                /* Here all the dimensions are known at compile time, let the
                 * compiler do its job. */
                for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
                {
                    int ofs_x = bj - WAVE_8_HALO_SIZE;
                    int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                    assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                    memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
                }
    
                for (size_t i = 0; i < BLK_ROWS; i++)
                {
                    for (size_t j = 0; j < BLK_COLS; j++)
                    {
                        T lapl = 0.0;
                        for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                            lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
                
                        for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                            lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
    
                        T val = lapl -
                                one_minus_adt * g_prev(bi+i, bj+j) +
                                two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                        
                        assert(j < BLK_COLS);
                        g_next(bi+i, bj+j) = val;
                    }
                }
            }
        }
    
        /* Do upper blocks */
        {
            size_t bi = blocks_y*BLK_ROWS;
            for (size_t bj = 0; bj < blocks_x*BLK_COLS; bj += BLK_COLS)
            {
                for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
                {
                    
                    int ofs_x = bj - WAVE_8_HALO_SIZE;
                    int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                    assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                    memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
                }
    
                for (size_t i = 0; i < rem_y; i++)
                {
                    for (size_t j = 0; j < BLK_COLS; j++)
                    {
                        T lapl = 0.0;
                        for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                            lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
                
                        for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                            lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
    
                        T val = lapl -
                                one_minus_adt * g_prev(bi+i, bj+j) +
                                two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                        
                        assert(j < BLK_COLS);
                        g_next(bi+i, bj+j) = val;
                    }
                }
            }
        }
    
        /* Do upper leftmost corner */
        {
            size_t bi = blocks_y*BLK_ROWS;
            size_t bj = blocks_x*BLK_COLS;
    
            for (size_t i = 0; i < rem_y+WAVE_8_HALO_SIZE; i++)
            {
                
                int ofs_x = bj - WAVE_8_HALO_SIZE;
                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (BLK_COLS+2*WAVE_8_HALO_SIZE)*sizeof(T));
            }
    
            for (size_t i = 0; i < rem_y; i++)
            {
                for (size_t j = 0; j < rem_x; j++)
                {
                    T lapl = 0.0;
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
            
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
    
                    T val = lapl -
                            one_minus_adt * g_prev(bi+i, bj+j) +
                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                    
                    assert(j < BLK_COLS);
                    g_next(bi+i, bj+j) = val;
                }
            }
        }
    
        /* Do leftmost blocks */
        for (size_t bi = 0; bi < blocks_y*BLK_ROWS; bi += BLK_ROWS)
        {
            size_t bj = blocks_x*BLK_COLS;
    
            for (size_t i = 0; i < BLK_ROWS+2*WAVE_8_HALO_SIZE; i++)
            {
                
                int ofs_x = bj - WAVE_8_HALO_SIZE;
                int ofs_y = bi + i - WAVE_8_HALO_SIZE;
                assert(i < BLK_ROWS+2*WAVE_8_HALO_SIZE);
                memcpy(s_curr[i], g_curr.data(ofs_y, ofs_x), (rem_x+WAVE_8_HALO_SIZE)*sizeof(T));
            }
    
            for (size_t i = 0; i < BLK_ROWS; i++)
            {
                for (size_t j = 0; j < rem_x; j++)
                {
                    T lapl = 0.0;
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE+k];
            
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * s_curr[i+WAVE_8_HALO_SIZE+k][j+WAVE_8_HALO_SIZE];
    
                    T val = lapl -
                            one_minus_adt * g_prev(bi+i, bj+j) +
                            two_minus_adt * s_curr[i+WAVE_8_HALO_SIZE][j+WAVE_8_HALO_SIZE];
                    
                    assert(j < BLK_COLS);
                    g_next(bi+i, bj+j) = val;
                }
            }
        }
    
        /* Apply BCs */
        for (size_t j = 0; j < maxcol; j++)
        {
            g_next(0,j) = 0;
            g_next(maxrow-1, j) = 0;
        }
    
        for (size_t i = 0; i < maxrow; i++)
        {
            g_next(i, 0) = 0;
            g_next(i, maxcol-1) = 0;
        }
    }
    
    template<typename T>
    void
    wave_2D_kernel(const fd_grid<T>& g_prev, const fd_grid<T>& g_curr,
                   fd_grid<T>& g_next,
                   const wave_2D_params<T>& params,
                   size_t thread_id = 0, size_t num_threads = 1)
    {
        int maxrow  = params.maxrow;
        int maxcol  = params.maxcol;
        T   dt      = params.dt;
        T   c       = params.velocity;
        T   a       = params.damping;
    
        assert(maxcol > 1);
        assert(maxrow > 1);
    
        /**** Initialize constants ****/
        static const T w0 = -205.0/72.0;
        static const T w1 = 8.0/5.0;
        static const T w2 = -1.0/5.0;
        static const T w3 = 8.0/315.0;
        static const T w4 = -1.0/560.0;
        static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
    
        T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
        T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
        T one_minus_adt = (1.0 - a*dt);
        T two_minus_adt = (2.0 - a*dt);
    
        for (size_t i = thread_id; i < maxrow; i += num_threads)
        {
            for (size_t j = 0; j < maxcol; j++)
            {
                T lapl = 0.0;
                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                    lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i,j+k);
        
                for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                    lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * g_curr(i+k,j);
    
                T val = lapl - one_minus_adt * g_prev(i, j) + two_minus_adt * g_curr(i, j);
    
                if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
                    val = 0;
                
                g_next(i, j) = val;
            }
        }
    }
    
    template<typename T, bool blocked>
    double
    solve_sequential_aux(wave_equation_context<T>& wec)
    {
        /* Simulation parameters */
        wave_2D_params<T> params;
        params.maxcol = wec.g_curr.domain_cols();
        params.maxrow = wec.g_curr.domain_rows();
        params.dt = wec.dt;
        params.velocity = wec.velocity;
        params.damping = wec.damping;
    
        tile tl;
        tl.dim_x = params.maxcol;
        tl.dim_y = params.maxrow;
        tl.idx_x = 0;
        tl.idx_y = 0;
    
        double time = 0.0;
    
    #ifdef SAVE_ITERTIME
        std::ofstream ofs;   
        if (blocked)
            ofs.open("itertime-blocked.txt");
        else
            ofs.open("itertime-naive.txt");
    #endif /* SAVE_ITERTIME */
    
        for (size_t i = 0; i < wec.maxiter; i++)
        {
            auto start = std::chrono::high_resolution_clock::now();
    
            if (blocked)
                wave_2D_kernel_blk(wec.g_prev, wec.g_curr, wec.g_next, params);
            else
                wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params);
    
            auto stop = std::chrono::high_resolution_clock::now();
    
            std::chrono::duration<double, std::milli> ms = stop - start;
            time += ms.count();
            
            std::swap(wec.g_prev, wec.g_curr);
            std::swap(wec.g_curr, wec.g_next);
    
    #ifdef HAVE_SILO
    #ifdef SAVE_TIMESTEPS
            if ( (i%100) == 0 )
            {
                std::stringstream ss;
                if (blocked)
                    ss << "wave_seqblk_" << i << ".silo";
                else
                    ss << "wave_seq_" << i << ".silo";
                visit_dump(wec.g_curr, ss.str());
            }
    #endif /* SAVE_TIMESTEPS */
    #endif /* HAVE_SILO */
    
    #ifdef SAVE_ITERTIME
            ofs << i << " " << ms.count() << std::endl;
    #endif /* SAVE_ITERTIME */
        }
        
        if (blocked)
        {
            std::cout << "[Wave][SeqBlk] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
            std::cout << "[Wave][SeqBlk] Wall Time: " << time << "ms" << std::endl;
    
            double itertime = time/wec.maxiter;
            double gflops_s = 58*(params.maxrow*params.maxcol)/(1e6*itertime);
            std::cout << "[Wave][SeqBlk] GFlops/s: " << gflops_s << std::endl;
        }
        else
        {
            std::cout << "[Wave][Sequential] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
            std::cout << "[Wave][Sequential] Wall Time: " << time << "ms" << std::endl;
            
            double itertime = time/wec.maxiter;
            double gflops_s = 58*(params.maxrow*params.maxcol)/(1e6*itertime);
            std::cout << "[Wave][Sequential] GFlops/s: " << gflops_s << std::endl;
    
            size_t kernel_bytes = 3*sizeof(T)*(params.maxrow*params.maxcol);
            double gbytes_s = kernel_bytes/(1e6*itertime);
            std::cout << "[Wave][Sequential] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
        }
    
    #ifdef HAVE_SILO
        if (blocked)
            visit_dump(wec.g_curr, "wave_seqblk_lastiter.silo");
        else
            visit_dump(wec.g_curr, "wave_seq_lastiter.silo");
    #endif /* HAVE_SILO */
    
        return time/wec.maxiter;
    }
    
    template<typename T>
    double solve_sequential(wave_equation_context<T>& wec)
    {
        return solve_sequential_aux<T,false>(wec);
    }
    
    template<typename T>
    double solve_sequential_blocked(wave_equation_context<T>& wec)
    {
        return solve_sequential_aux<T,true>(wec);
    }
    
    template<typename T>
    double solve_multithread(wave_equation_context<T>& wec, size_t nths)
    {
        /* Simulation parameters */
        wave_2D_params<T> params;
        params.maxcol = wec.g_curr.domain_cols();
        params.maxrow = wec.g_curr.domain_rows();
        params.dt = wec.dt;
        params.velocity = wec.velocity;
        params.damping = wec.damping;
    
    
        /* Multithreading stuff */
        std::mutex                  cv_mtx;
        std::condition_variable     prod_cv;
        std::condition_variable     cons_cv;
        std::vector<bool>           thread_done(nths);
        bool                        iteration_finished = false;
    
        auto thread_lambda = [&](size_t thread_id, size_t num_threads) {
    #ifdef DISALLOW_DENORMALS
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    #endif
            while (1)
            {
                /* Wait for the producer to notify that there's something to do */
                {
                    std::unique_lock<std::mutex> lck(cv_mtx);
                    while ( thread_done[thread_id] )
                        cons_cv.wait(lck);
    
                    if (iteration_finished)
                        return;
                }
    
                /* Do the timestep */
                wave_2D_kernel(wec.g_prev, wec.g_curr, wec.g_next, params, thread_id, num_threads);
    
                /* Work for this thread finished, notify producer */
                std::unique_lock<std::mutex> lck(cv_mtx);
                prod_cv.notify_one();
                thread_done[thread_id] = true;
            }
        };
    
    
        std::vector<std::thread> threads(nths);
        for (size_t i = 0; i < nths; i++)
            threads[i] = std::thread(thread_lambda, i, nths);
    
        double time = 0.0;
    
        for (size_t iter = 0; iter < wec.maxiter; iter++)
        {
            auto start = std::chrono::high_resolution_clock::now();
    
            std::unique_lock<std::mutex> lck(cv_mtx);
            /* Mark data ready and start the threads */
            for (size_t i = 0; i < nths; i++)
                thread_done[i] = false;
            cons_cv.notify_all();
    
            while ( !std::all_of(thread_done.begin(), thread_done.end(), [](bool x) -> bool { return x; } ) )
                prod_cv.wait(lck);
    
            auto stop = std::chrono::high_resolution_clock::now();
            std::chrono::duration<double, std::milli> ms = stop - start;
            time += ms.count();
    
            std::swap(wec.g_prev, wec.g_curr);
            std::swap(wec.g_curr, wec.g_next);
    
    #ifdef HAVE_SILO
    #ifdef SAVE_TIMESTEPS
            if ( (iter%100) == 0 )
            {
                std::stringstream ss;
                ss << "wave_mt_" << iter << ".silo";
                visit_dump(wec.g_curr, ss.str());
            }
    #endif /* SAVE_TIMESTEPS */
    #endif /* HAVE_SILO */
        }
    
        /* Tell all the threads to stop */
        {
            std::unique_lock<std::mutex> lck(cv_mtx);
            for (size_t i = 0; i < nths; i++)
                thread_done[i] = false;
            iteration_finished = true;
            cons_cv.notify_all();
        }
    
        /* Wait for all the threads to finish */
        for (auto& th : threads)
            th.join();
        
        std::cout << "[Wave][MT] Iteration Time (" << nths << " threads): ";
        std::cout << time/wec.maxiter << "ms" << std::endl;
        std::cout << "[Wave][MT] Wall Time (" << nths << " threads): ";
        std::cout << time << "ms" << std::endl;
    
    #ifdef HAVE_SILO
        visit_dump(wec.g_curr, "wave_mt_lastiter.silo");
    #endif /* HAVE_SILO */
    
        return time/wec.maxiter;
    }