Skip to content
Snippets Groups Projects
Select Git revision
  • f2b143dbeefdf1635810f1f2176e428302464d8f
  • master default protected
2 results

fd_openacc.cpp

Blame
  • fd_openacc.cpp 4.48 KiB
    #include <iostream>
    #include <fstream>
    #include <cstdio>
    #include <unistd.h>
    
    #include "fd_wave_cpu.hpp"
    
    /* pgc++ -O3 -I /home/math0471p/matteo/mysoft/silo/include/ -L /home/math0471p/matteo/mysoft/silo/lib/ -DHAVE_SILO -DSAVE_TIMESTEPS -acc -ta=nvidia:managed,time -Minfo=accel fd_openacc.cpp -lsilo */
    
    template<typename T>
    double solve_openacc(wave_equation_context<T>& wec)
    {
        /* Simulation parameters */
        int maxcol = wec.g_curr.domain_cols();
        int maxrow = wec.g_curr.domain_rows();
        T dt = wec.dt;
        T c = wec.velocity;
        T a = wec.damping;
    
        assert(maxcol > 1);
        assert(maxrow > 1);
    
        double time = 0.0;
    
    #ifdef SAVE_ITERTIME
        std::ofstream ofs;   
        ofs.open("itertime-naive.txt");
    #endif /* SAVE_ITERTIME */
    
        /**** Initialize constants ****/
        static const T w0 = -205.0/72.0;
        static const T w1 = 8.0/5.0;
        static const T w2 = -1.0/5.0;
        static const T w3 = 8.0/315.0;
        static const T w4 = -1.0/560.0;
        static const T w[9] = { w4, w3, w2, w1, w0, w1, w2, w3, w4 };
    
        T * __restrict__ u_prev       = wec.g_prev.data();
        T * __restrict__ u_curr       = wec.g_curr.data();
        T * __restrict__ u_next       = wec.g_next.data();
        size_t nelem    = wec.g_curr.size();
    
    #define U_OFFSET(i,j) ( (2*WAVE_8_HALO_SIZE+maxcol)*(i+WAVE_8_HALO_SIZE) + (j+WAVE_8_HALO_SIZE) )
    
    //#pragma acc data copy(u_prev[0:nelem])
    //#pragma acc data copy(u_curr[0:nelem])
    //#pragma acc data copy(u_next[0:nelem])
        for (size_t iter = 0; iter < wec.maxiter; iter++)
        {
            auto start = std::chrono::high_resolution_clock::now();
    
            T kx2 = c*c * dt*dt * (maxcol-1)*(maxcol-1);
            T ky2 = c*c * dt*dt * (maxrow-1)*(maxrow-1);
            T one_minus_adt = (1.0 - a*dt);
            T two_minus_adt = (2.0 - a*dt);
    
    #pragma omp parallel for shared(maxrow, maxcol, u_prev, u_curr, u_next)
    #pragma acc kernels
            #pragma acc loop independent
            for (size_t i = 0; i < maxrow; i++)
            {
                #pragma acc loop independent
                for (size_t j = 0; j < maxcol; j++)
                {
                    T lapl = 0.0;
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += kx2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i,j+k) ];
            
                    for (int k = -WAVE_8_HALO_SIZE; k <= WAVE_8_HALO_SIZE; k++)
                        lapl += ky2 * w[k+WAVE_8_HALO_SIZE] * u_curr[ U_OFFSET(i+k,j) ];
    
                    T val = lapl - 
                            one_minus_adt * u_prev[ U_OFFSET(i,j) ] + 
                            two_minus_adt * u_curr[ U_OFFSET(i,j) ];
    
                    if ( (i == 0) or (j == 0) or (i == maxrow-1) or (j == maxcol-1) )
                        val = 0;
                    
                    u_next[ U_OFFSET(i,j) ] = val;
                }
            }
    
            auto stop = std::chrono::high_resolution_clock::now();
    
            std::chrono::duration<double, std::milli> ms = stop - start;
            time += ms.count();
            
            std::swap(u_prev, u_curr);
            std::swap(u_curr, u_next);
    
    #ifdef HAVE_SILO
    #ifdef SAVE_TIMESTEPS
            if ( (iter%100) == 0 )
            {
                //#pragma acc update self(u_curr[0:nelem])
                std::stringstream ss;
                ss << "wave_openacc_" << iter << ".silo";
                visit_dump(wec.g_curr, ss.str());
            }
    #endif /* SAVE_TIMESTEPS */
    #endif /* HAVE_SILO */
    
    #ifdef SAVE_ITERTIME
            ofs << i << " " << ms.count() << std::endl;
    #endif /* SAVE_ITERTIME */
        }
        
        std::cout << "[Wave][OpenACC] Iteration Time: " << time/wec.maxiter << "ms" << std::endl;
        std::cout << "[Wave][OpenACC] Wall Time: " << time << "ms" << std::endl;
        
        double itertime = time/wec.maxiter;
        double gflops_s = 58*(maxrow*maxcol)/(1e6*itertime);
        std::cout << "[Wave][OpenACC] GFlops/s: " << gflops_s << std::endl;
    
        size_t kernel_bytes = 3*sizeof(T)*(maxrow*maxcol);
        double gbytes_s = kernel_bytes/(1e6*itertime);
        std::cout << "[Wave][OpenACC] Bandwidth: " << gbytes_s << "GB/s" << std::endl;
        
    
    #ifdef HAVE_SILO
        visit_dump(wec.g_curr, "wave_openacc_lastiter.silo");
    #endif /* HAVE_SILO */
    
        return time/wec.maxiter;
    }
    
    int main(void)
    {
    #ifdef SINGLE_PRECISION
        using T = float;
        std::cout << "Precision: single" << std::endl;
    #else
        using T = double;
        std::cout << "Precision: double" << std::endl;
    #endif
    
        double time;
    
    	for (size_t sz = 256; sz <= 256; sz *= 2)
        {
            std::cout << sz << std::endl;
            wave_equation_context<T> wec(sz, sz, 1, 0.1, 0.0001, 5000);
    
            wec.init();
            time = solve_openacc(wec);
        }
    
        return 0;
    }