Skip to content
Snippets Groups Projects
Select Git revision
  • f4b4645146183eb4c618ee1e0a6e1b6a49424622
  • master default protected
  • devel
  • MC/hide_eigen_sol
  • compare_at_gauss_points
  • mixed_interface_fix
  • detect-gmsh-api-version
  • MC/mpi_support
  • MC/volsrcs
  • fix-gmsh-master
  • find-gmsh
  • MC/hipify
  • v0.3.0
  • v0.2.0
  • v0.1.0
15 results

gpu_support.hpp

Blame
  • gpu_support.hpp 14.34 KiB
    #pragma once
    
    #include <cstring>
    
    /* GPU api wrapper, allows to bail out quickly from HIP in case
     * things go the wrong way with it */
    #ifdef GPU_USE_HIP
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-result"
    #pragma GCC diagnostic ignored "-Wshadow"
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Weverything"
    #include <hip/hip_runtime.h>
    #include <hip/hip_runtime_api.h>
    #pragma clang diagnostic pop
    #pragma GCC diagnostic pop
    #else /* GPU_USE_HIP */
    #include <cuda_runtime.h>
    #endif /* GPU_USE_HIP */
    
    #ifdef GPU_USE_HIP
    
    #define gpuDeviceSynchronize            hipDeviceSynchronize
    #define gpuError_t                      hipError_t
    #define gpuSuccess                      hipSuccess
    #define gpuMemcpyKind                   hipMemcpyKind
    #define gpuMemcpyHostToHost             hipMemcpyHostToHost     
    #define gpuMemcpyHostToDevice           hipMemcpyHostToDevice   
    #define gpuMemcpyDeviceToHost           hipMemcpyDeviceToHost   
    #define gpuMemcpyDeviceToDevice         hipMemcpyDeviceToDevice     
    #define gpuMemcpyDefault                hipMemcpyDefault
    #define gpuTextureDesc                  hipTextureDesc
    #define gpuResourceDesc                 hipResourceDesc
    #ifdef DISABLE_TEXTURE_CACHE
        #define gpuTextureObject_t          double *
    #else
        #define gpuTextureObject_t          hipTextureObject_t
    #endif
    #define gpuCreateTextureObject          hipCreateTextureObject
    #define gpuDestroyTextureObject         hipDestroyTextureObject
    #define gpuAddressModeClamp             hipAddressModeClamp
    #define gpuReadModeElementType          hipReadModeElementType
    #define gpuResourceTypeLinear           hipResourceTypeLinear
    #define gpuChannelFormatKindUnsigned    hipChannelFormatKindUnsigned
    #define gpuEvent_t                      hipEvent_t
    #define gpuEventCreate                  hipEventCreate
    #define gpuEventDestroy                 hipEventDestroy
    #define gpuEventRecord                  hipEventRecord
    #define gpuEventSynchronize             hipEventSynchronize
    #define gpuEventElapsedTime             hipEventElapsedTime
    #define gpuStream_t                     hipStream_t
    #define gpuStreamCreate                 hipStreamCreate
    #define gpuStreamDestroy                hipStreamDestroy
    #define gpuStreamSynchronize            hipStreamSynchronize
    #define gpuMallocHost                   hipHostMalloc
    #define gpuFreeHost                     hipHostFree
    #define gpuMemcpyAsync                  hipMemcpyAsync
    
    #else /* GPU_USE_HIP */
    
    #define gpuDeviceSynchronize            cudaDeviceSynchronize
    #define gpuError_t                      cudaError_t
    #define gpuSuccess                      cudaSuccess
    #define gpuMemcpyKind                   cudaMemcpyKind
    #define gpuMemcpyHostToHost             cudaMemcpyHostToHost     
    #define gpuMemcpyHostToDevice           cudaMemcpyHostToDevice   
    #define gpuMemcpyDeviceToHost           cudaMemcpyDeviceToHost   
    #define gpuMemcpyDeviceToDevice         cudaMemcpyDeviceToDevice     
    #define gpuMemcpyDefault                cudaMemcpyDefault 
    #define gpuTextureDesc                  cudaTextureDesc
    #define gpuResourceDesc                 cudaResourceDesc
    #ifdef DISABLE_TEXTURE_CACHE
        #define gpuTextureObject_t          double *
    #else
        #define gpuTextureObject_t          cudaTextureObject_t
    #endif
    #define gpuCreateTextureObject          cudaCreateTextureObject
    #define gpuDestroyTextureObject         cudaDestroyTextureObject
    #define gpuAddressModeClamp             cudaAddressModeClamp
    #define gpuReadModeElementType          cudaReadModeElementType
    #define gpuResourceTypeLinear           cudaResourceTypeLinear
    #define gpuChannelFormatKindUnsigned    cudaChannelFormatKindUnsigned
    #define gpuEvent_t                      cudaEvent_t
    #define gpuEventCreate                  cudaEventCreate
    #define gpuEventDestroy                 cudaEventDestroy
    #define gpuEventRecord                  cudaEventRecord
    #define gpuEventSynchronize             cudaEventSynchronize
    #define gpuEventElapsedTime             cudaEventElapsedTime
    #define gpuStream_t                     cudaStream_t
    #define gpuStreamCreate                 cudaStreamCreate
    #define gpuStreamDestroy                cudaStreamDestroy
    #define gpuStreamSynchronize            cudaStreamSynchronize
    #define gpuMallocHost                   cudaMallocHost
    #define gpuFreeHost                     cudaFreeHost
    #define gpuMemcpyAsync                  cudaMemcpyAsync
    #endif /* GPU_USE_HIP */
    
    #pragma clang diagnostic push
    #pragma clang diagnostic ignored "-Wunused-parameter"
    inline void
    checkGPU(gpuError_t result)
    {
    #ifdef GPU_DEBUG
        if (result != gpuSuccess) {
    #ifdef GPU_USE_HIP
            fprintf(stderr, "HIP Runtime Error: %s\n", hipGetErrorString(result));
    #else /* GPU_USE_HIP */
            fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    #endif /* GPU_USE_HIP */
            assert(result == gpuSuccess);
        }
    #endif /* GPU_DEBUG */
    //    return result;
    }
    #pragma clang diagnostic pop
    
    inline gpuError_t
    gpuMalloc(void **ptr, size_t size)
    {
    #ifdef GPU_USE_HIP
        return hipMalloc(ptr, size);
    #else
        return cudaMalloc(ptr, size);
    #endif
    }
    
    inline gpuError_t
    gpuFree(void *ptr)
    {
    #ifdef GPU_USE_HIP
        return hipFree(ptr);
    #else
        return cudaFree(ptr);
    #endif
    }
    
    inline gpuError_t
    gpuMemset(void *ptr, int value, size_t size)
    {
    #ifdef GPU_USE_HIP
        return hipMemset(ptr, value, size);
    #else
        return cudaMemset(ptr, value, size);
    #endif   
    }
    
    inline gpuError_t
    gpuMemcpy(void *dst, const void *src, size_t size, gpuMemcpyKind kind)
    {
    #ifdef GPU_USE_HIP
        return hipMemcpy(dst, src, size, kind);
    #else
        return cudaMemcpy(dst, src, size, kind);
    #endif   
    }
    
    
    #if defined(__NVCC__) || defined(__HIP__)
    static __inline__ __device__ double
    fetch_tex(gpuTextureObject_t tex, int32_t ofs)
    {
    #ifdef DISABLE_TEXTURE_CACHE
        return tex[ofs];
    #else
        uint2 val = tex1Dfetch<uint2>(tex, ofs);
        return __hiloint2double(val.y, val.x);
    #endif
    }
    #endif
    
    class stream
    {
        gpuStream_t    m_stream;
    
    public:
        stream()
        {
            checkGPU( gpuStreamCreate(&m_stream) );
        }
    
        stream(const stream&) = delete;
        stream(stream&&) = delete;
        stream& operator=(const stream&) = delete;
        stream& operator=(stream&&) = delete;
    
        ~stream()
        {
            checkGPU( gpuStreamDestroy(m_stream) );
        }
    
        operator gpuStream_t() const
        {
            return m_stream;
        }
    
        void wait(void) const
        {
            checkGPU( gpuStreamSynchronize(m_stream) );
        }
    };
    
    template<typename T>
    class pinned_vector
    {
        T *     m_ptr;
        size_t  m_size;
    
    public:
        pinned_vector()
            : m_ptr(nullptr), m_size(0)
        {}
    
        pinned_vector(size_t size)
            : m_ptr(nullptr), m_size(0)
        {
            (void)checkGPU( gpuMallocHost((void**)&m_ptr, size*sizeof(T)) );
            memset(m_ptr, '\0', size*sizeof(double));
            m_size = size;
        }
    
        pinned_vector(const pinned_vector&) = delete;
        pinned_vector(pinned_vector&&) = delete;
        pinned_vector& operator=(const pinned_vector&) = delete;
        pinned_vector& operator=(pinned_vector&&) = delete;
    
        void resize(size_t size)
        {
            if (size == m_size)
                return;
    
            if (m_ptr != nullptr)
                (void) checkGPU(gpuFreeHost(m_ptr));
    
            (void)checkGPU( gpuMallocHost((void**)&m_ptr, size*sizeof(T)) );
            memset(m_ptr, '\0', size*sizeof(double));
            m_size = size;
        }
    
        void zero(void)
        {
            memset(m_ptr, '\0', m_size*sizeof(double));
        }
    
        size_t size(void) const
        {
            return m_size;
        }
    
        T* data(void)
        {
            return m_ptr;
        }
    
        const T* data(void) const
        {
            return m_ptr;
        }
    
        T operator[](size_t pos) const
        {
            return m_ptr[pos];
        }
    
        T& operator[](size_t pos)
        {
            return m_ptr[pos];
        }
    };
    
    template<typename T>
    class device_vector
    {
        T *     m_devptr;
        size_t  m_size;
    
    public:
        device_vector()
            : m_devptr(nullptr), m_size(0)
        {}
    
        device_vector(size_t size)
            : m_devptr(nullptr), m_size(0)
        {
            (void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
            (void)checkGPU( gpuMemset(m_devptr, 0, size*sizeof(T)) );
            m_size = size;
        }
    
        device_vector(const T *src, size_t size)
            : m_devptr(nullptr), m_size(0)
        {
            copyin(src, size);
        }
    
        device_vector(const device_vector& other)
        {
            (void)checkGPU( gpuMalloc((void**)&m_devptr, other.m_size*sizeof(T)) );
            (void)checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
            m_size = other.m_size;
        }
    
        device_vector(device_vector&& other)
        {
            m_devptr = other.m_devptr;
            m_size = other.m_size;
            other.m_devptr = nullptr;
            other.m_size = 0;
        }
    
        void copyin(const T *src, size_t size)
        {
            if (m_devptr != nullptr)
                (void) gpuFree(m_devptr);
    
            (void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
            (void)checkGPU( gpuMemcpy(m_devptr, src, size*sizeof(T), gpuMemcpyHostToDevice) );
            m_size = size;
        }
    
        void copyin(const T *src, size_t size, const stream& st)
        {
            assert(size == m_size);
            (void)checkGPU( gpuMemcpyAsync(m_devptr, src, size*sizeof(T), gpuMemcpyHostToDevice, st) );
        }
    
        void copyin(const T *src, size_t start, size_t num, const stream& st)
        {
            assert(start < m_size);
            assert(start+num <= m_size);
            const T *c_begin = src + start;
            T *d_begin = m_devptr + start;
            (void)checkGPU( gpuMemcpyAsync(d_begin, c_begin, num*sizeof(T), gpuMemcpyHostToDevice, st) );
        }
    
        void copyout(T *dst) const
        {
            if (m_devptr == nullptr)
                return;
    
            (void)checkGPU( gpuMemcpy(dst, m_devptr, m_size*sizeof(T), gpuMemcpyDeviceToHost) );
        }
    
        void resize(size_t size)
        {
            if (size == m_size)
                return;
    
            if (m_devptr != nullptr)
                (void) gpuFree(m_devptr);
    
            (void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
            m_size = size;
        }
    
        device_vector& operator=(const device_vector& other)
        {
            if ( (m_devptr != nullptr) or (m_size != other.m_size) )
            {
                (void) gpuFree(m_devptr);
                (void) checkGPU( gpuMalloc((void**)&m_devptr, other.m_size*sizeof(T)) );
            }
    
            (void) checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
            m_size = other.m_size;
            return *this;
        }
    
        device_vector& operator=(device_vector&& other)
        {
            m_devptr = other.m_devptr;
            m_size = other.m_size;
            other.m_devptr = nullptr;
            other.m_size = 0;
            return *this;
        }
    
        void zero(void)
        {
            (void) checkGPU( gpuMemset(m_devptr, 0, m_size*sizeof(T)) );
        }
    
        T* data()
        {
            return m_devptr;
        }
    
        const T* data() const
        {
            return m_devptr;
        }
    
        size_t size() const
        {
            return m_size;
        }
    
        ~device_vector()
        {
            if (m_devptr)
                (void)checkGPU(gpuFree(m_devptr));
        }
    };
    
    
    template<typename T>
    class texture_allocator;
    
    template<>
    class texture_allocator<double>
    {
    #ifndef DISABLE_TEXTURE_CACHE
        gpuTextureDesc          cTexDesc;
        gpuResourceDesc         cResDesc;
        gpuTextureObject_t      cTexObj;
    #endif
        double                  *d_texDataPtr;
        size_t                  data_size;
    
    public:
        texture_allocator()
            : d_texDataPtr(nullptr)
        {}
    
        texture_allocator(const double *data, size_t size)
            : d_texDataPtr(nullptr)
        {
            init(data, size);
        }
    
        texture_allocator(texture_allocator&& other)
        {
    #ifndef DISABLE_TEXTURE_CACHE
            cTexDesc            = other.cTexDesc;
            cResDesc            = other.cResDesc;
            cTexObj             = other.cTexObj;
    #endif
            d_texDataPtr        = other.d_texDataPtr;
            other.d_texDataPtr  = nullptr;
            data_size           = other.data_size;
            other.data_size     = 0;
        }
    
        texture_allocator(const texture_allocator&) = delete;
        texture_allocator& operator=(const texture_allocator&) = delete;
    
        void init(const double *data, size_t size)
        {
            if (d_texDataPtr)
                return;
    
            /* alloc space for data on device */
            checkGPU( gpuMalloc((void**)&d_texDataPtr, size*sizeof(double)) );
    
            /* copy data on device*/
            if (data)
                checkGPU( gpuMemcpy(d_texDataPtr, data, size*sizeof(double), gpuMemcpyHostToDevice) );
            else
                checkGPU( gpuMemset(d_texDataPtr, 0, size*sizeof(double)) );
    
    #ifndef DISABLE_TEXTURE_CACHE
            /* cudaTextureDesc */
            memset(&cTexDesc, 0, sizeof(cTexDesc));
            cTexDesc.normalizedCoords = 0;
            cTexDesc.addressMode[0] = gpuAddressModeClamp;
            cTexDesc.readMode = gpuReadModeElementType;
    
            /* cudaResourceDesc */
            memset(&cResDesc, 0, sizeof(cResDesc));
            cResDesc.resType = gpuResourceTypeLinear;
            cResDesc.res.linear.devPtr = d_texDataPtr;
            cResDesc.res.linear.sizeInBytes = size * sizeof(double);
            cResDesc.res.linear.desc.f = gpuChannelFormatKindUnsigned;
            cResDesc.res.linear.desc.x = 32;
            cResDesc.res.linear.desc.y = 32;
    
            /* create texture */
            checkGPU( gpuCreateTextureObject(&cTexObj, &cResDesc, &cTexDesc, nullptr) );
    #endif
            data_size = size;
        }
    
        gpuTextureObject_t get_texture(void) const
        {
    #ifdef DISABLE_TEXTURE_CACHE
            return d_texDataPtr;
    #else
            return cTexObj;
    #endif
        }
    
        double * data(void) const
        {
            return d_texDataPtr;
        }
    
        size_t size() const
        {
            return data_size;
        }
    
        ~texture_allocator()
        {
            if (d_texDataPtr)
            {
    #ifndef DISABLE_TEXTURE_CACHE
                checkGPU( gpuDestroyTextureObject(cTexObj) );
    #endif
                checkGPU( gpuFree(d_texDataPtr) );
            }
    
            d_texDataPtr = nullptr;
            data_size = 0;
        }
    };
    
    
    
    class timecounter_gpu
    {
        gpuEvent_t start;
        gpuEvent_t stop;
    
    public:
        timecounter_gpu()
        {
            checkGPU( gpuEventCreate(&start) );
            checkGPU( gpuEventCreate(&stop) );
        }
    
        ~timecounter_gpu()
        {
            checkGPU( gpuEventDestroy(start) );
            checkGPU( gpuEventDestroy(stop) );
        }
    
        void tic()
        {
            checkGPU( gpuEventRecord(start, 0) );
        }
    
        double toc()
        {
            checkGPU( gpuEventRecord(stop, 0) );
    
            float elapsed;
            checkGPU( gpuEventSynchronize(stop) );
            checkGPU( gpuEventElapsedTime(&elapsed, start, stop) );
            return double(elapsed)/1000.0;
        }
    };