/* gpu_support.hpp */
#pragma once
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
/* GPU API wrapper: exposes a common gpu* interface over both HIP and CUDA,
 * so we can bail out of HIP quickly in case things go the wrong way with it. */
#ifdef GPU_USE_HIP
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-result"
#pragma GCC diagnostic ignored "-Wshadow"
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Weverything"
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#pragma clang diagnostic pop
#pragma GCC diagnostic pop
#else /* GPU_USE_HIP */
#include <cuda_runtime.h>
#endif /* GPU_USE_HIP */
#ifdef GPU_USE_HIP
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuMemcpyKind hipMemcpyKind
#define gpuMemcpyHostToHost hipMemcpyHostToHost
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyDefault hipMemcpyDefault
#define gpuTextureDesc hipTextureDesc
#define gpuResourceDesc hipResourceDesc
#ifdef DISABLE_TEXTURE_CACHE
#define gpuTextureObject_t double *
#else
#define gpuTextureObject_t hipTextureObject_t
#endif
#define gpuCreateTextureObject hipCreateTextureObject
#define gpuDestroyTextureObject hipDestroyTextureObject
#define gpuAddressModeClamp hipAddressModeClamp
#define gpuReadModeElementType hipReadModeElementType
#define gpuResourceTypeLinear hipResourceTypeLinear
#define gpuChannelFormatKindUnsigned hipChannelFormatKindUnsigned
#define gpuEvent_t hipEvent_t
#define gpuEventCreate hipEventCreate
#define gpuEventDestroy hipEventDestroy
#define gpuEventRecord hipEventRecord
#define gpuEventSynchronize hipEventSynchronize
#define gpuEventElapsedTime hipEventElapsedTime
#define gpuStream_t hipStream_t
#define gpuStreamCreate hipStreamCreate
#define gpuStreamDestroy hipStreamDestroy
#define gpuStreamSynchronize hipStreamSynchronize
#define gpuMallocHost hipHostMalloc
#define gpuFreeHost hipHostFree
#define gpuMemcpyAsync hipMemcpyAsync
#else /* GPU_USE_HIP */
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuMemcpyKind cudaMemcpyKind
#define gpuMemcpyHostToHost cudaMemcpyHostToHost
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#define gpuMemcpyDefault cudaMemcpyDefault
#define gpuTextureDesc cudaTextureDesc
#define gpuResourceDesc cudaResourceDesc
#ifdef DISABLE_TEXTURE_CACHE
#define gpuTextureObject_t double *
#else
#define gpuTextureObject_t cudaTextureObject_t
#endif
#define gpuCreateTextureObject cudaCreateTextureObject
#define gpuDestroyTextureObject cudaDestroyTextureObject
#define gpuAddressModeClamp cudaAddressModeClamp
#define gpuReadModeElementType cudaReadModeElementType
#define gpuResourceTypeLinear cudaResourceTypeLinear
#define gpuChannelFormatKindUnsigned cudaChannelFormatKindUnsigned
#define gpuEvent_t cudaEvent_t
#define gpuEventCreate cudaEventCreate
#define gpuEventDestroy cudaEventDestroy
#define gpuEventRecord cudaEventRecord
#define gpuEventSynchronize cudaEventSynchronize
#define gpuEventElapsedTime cudaEventElapsedTime
#define gpuStream_t cudaStream_t
#define gpuStreamCreate cudaStreamCreate
#define gpuStreamDestroy cudaStreamDestroy
#define gpuStreamSynchronize cudaStreamSynchronize
#define gpuMallocHost cudaMallocHost
#define gpuFreeHost cudaFreeHost
#define gpuMemcpyAsync cudaMemcpyAsync
#endif /* GPU_USE_HIP */
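/* Note: with the aliases above the same translation unit compiles against
 * either backend by toggling a single macro. A plausible invocation (an
 * assumption, not taken from this project's build files) would be
 *   hipcc -DGPU_USE_HIP code.cpp   // HIP / ROCm
 *   nvcc code.cu                   // CUDA
 */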
/* In GPU_DEBUG builds, print and assert on any runtime error; the error code
 * is passed through so callers may also inspect it (the (void) casts at the
 * call sites below discard it deliberately). */
inline gpuError_t
checkGPU(gpuError_t result)
{
#ifdef GPU_DEBUG
if (result != gpuSuccess) {
#ifdef GPU_USE_HIP
fprintf(stderr, "HIP Runtime Error: %s\n", hipGetErrorString(result));
#else /* GPU_USE_HIP */
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
#endif /* GPU_USE_HIP */
assert(result == gpuSuccess);
}
#endif /* GPU_DEBUG */
return result;
}
inline gpuError_t
gpuMalloc(void **ptr, size_t size)
{
#ifdef GPU_USE_HIP
return hipMalloc(ptr, size);
#else
return cudaMalloc(ptr, size);
#endif
}
inline gpuError_t
gpuFree(void *ptr)
{
#ifdef GPU_USE_HIP
return hipFree(ptr);
#else
return cudaFree(ptr);
#endif
}
inline gpuError_t
gpuMemset(void *ptr, int value, size_t size)
{
#ifdef GPU_USE_HIP
return hipMemset(ptr, value, size);
#else
return cudaMemset(ptr, value, size);
#endif
}
inline gpuError_t
gpuMemcpy(void *dst, const void *src, size_t size, gpuMemcpyKind kind)
{
#ifdef GPU_USE_HIP
return hipMemcpy(dst, src, size, kind);
#else
return cudaMemcpy(dst, src, size, kind);
#endif
}
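/* Usage sketch (hypothetical caller code, not part of this header): the
 * inline wrappers keep call sites free of backend #ifdefs, with errors
 * funneled through checkGPU(). Here n and hptr stand for an assumed element
 * count and host buffer:
 *
 *   double *dptr = nullptr;
 *   checkGPU( gpuMalloc((void **)&dptr, n*sizeof(double)) );
 *   checkGPU( gpuMemcpy(dptr, hptr, n*sizeof(double), gpuMemcpyHostToDevice) );
 *   checkGPU( gpuFree(dptr) );
 */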
#if defined(__NVCC__) || defined(__HIP__)
/* Read a double through the texture path. 1D texture fetches cannot return
 * doubles directly, so the value is fetched as two 32-bit words (uint2) and
 * reassembled with __hiloint2double(hi, lo). */
static __inline__ __device__ double
fetch_tex(gpuTextureObject_t tex, int32_t ofs)
{
#ifdef DISABLE_TEXTURE_CACHE
return tex[ofs];
#else
uint2 val = tex1Dfetch<uint2>(tex, ofs);
return __hiloint2double(val.y, val.x);
#endif
}
#endif
class stream
{
gpuStream_t m_stream;
public:
stream()
{
checkGPU( gpuStreamCreate(&m_stream) );
}
stream(const stream&) = delete;
stream(stream&&) = delete;
stream& operator=(const stream&) = delete;
stream& operator=(stream&&) = delete;
~stream()
{
checkGPU( gpuStreamDestroy(m_stream) );
}
operator gpuStream_t() const
{
return m_stream;
}
void wait(void) const
{
checkGPU( gpuStreamSynchronize(m_stream) );
}
};
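/* Usage sketch (hypothetical): the implicit conversion operator lets a stream
 * be passed wherever a raw gpuStream_t handle is expected, e.g. with the
 * async copy alias above (hptr, dptr and n are assumed):
 *
 *   stream s;
 *   checkGPU( gpuMemcpyAsync(dptr, hptr, n*sizeof(double),
 *                            gpuMemcpyHostToDevice, s) );
 *   s.wait();   // gpuStreamSynchronize
 */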
template<typename T>
class pinned_vector
{
T * m_ptr;
size_t m_size;
public:
pinned_vector()
: m_ptr(nullptr), m_size(0)
{}
pinned_vector(size_t size)
: m_ptr(nullptr), m_size(0)
{
(void)checkGPU( gpuMallocHost((void**)&m_ptr, size*sizeof(T)) );
memset(m_ptr, '\0', size*sizeof(T));
m_size = size;
}
pinned_vector(const pinned_vector&) = delete;
pinned_vector(pinned_vector&&) = delete;
pinned_vector& operator=(const pinned_vector&) = delete;
pinned_vector& operator=(pinned_vector&&) = delete;
void resize(size_t size)
{
if (size == m_size)
return;
if (m_ptr != nullptr)
(void) checkGPU(gpuFreeHost(m_ptr));
(void)checkGPU( gpuMallocHost((void**)&m_ptr, size*sizeof(T)) );
memset(m_ptr, '\0', size*sizeof(T));
m_size = size;
}
void zero(void)
{
memset(m_ptr, '\0', m_size*sizeof(T));
}
size_t size(void) const
{
return m_size;
}
T* data(void)
{
return m_ptr;
}
const T* data(void) const
{
return m_ptr;
}
T operator[](size_t pos) const
{
return m_ptr[pos];
}
T& operator[](size_t pos)
{
return m_ptr[pos];
}
/* free the page-locked allocation; without this the vector leaks host memory */
~pinned_vector()
{
if (m_ptr != nullptr)
(void) checkGPU( gpuFreeHost(m_ptr) );
}
};
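/* Usage sketch (hypothetical): page-locked (pinned) host memory is what makes
 * gpuMemcpyAsync genuinely asynchronous, so a pinned_vector is the natural
 * host-side staging buffer for the stream class above:
 *
 *   pinned_vector<double> h(1024);   // page-locked and zero-initialized
 *   h[0] = 42.0;
 *   // h.data() can be handed to gpuMemcpyAsync together with a stream
 */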
template<typename T>
class device_vector
{
T * m_devptr;
size_t m_size;
public:
device_vector()
: m_devptr(nullptr), m_size(0)
{}
device_vector(size_t size)
: m_devptr(nullptr), m_size(0)
{
(void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
(void)checkGPU( gpuMemset(m_devptr, 0, size*sizeof(T)) );
m_size = size;
}
device_vector(const T *src, size_t size)
: m_devptr(nullptr), m_size(0)
{
copyin(src, size);
}
device_vector(const device_vector& other)
{
(void)checkGPU( gpuMalloc((void**)&m_devptr, other.m_size*sizeof(T)) );
(void)checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
m_size = other.m_size;
}
device_vector(device_vector&& other)
{
m_devptr = other.m_devptr;
m_size = other.m_size;
other.m_devptr = nullptr;
other.m_size = 0;
}
void copyin(const T *src, size_t size)
{
if (m_devptr != nullptr)
(void) gpuFree(m_devptr);
(void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
(void)checkGPU( gpuMemcpy(m_devptr, src, size*sizeof(T), gpuMemcpyHostToDevice) );
m_size = size;
}
void copyin(const T *src, size_t size, const stream& st)
{
assert(size == m_size);
(void)checkGPU( gpuMemcpyAsync(m_devptr, src, size*sizeof(T), gpuMemcpyHostToDevice, st) );
}
void copyin(const T *src, size_t start, size_t num, const stream& st)
{
assert(start < m_size);
assert(start+num <= m_size);
const T *c_begin = src + start;
T *d_begin = m_devptr + start;
(void)checkGPU( gpuMemcpyAsync(d_begin, c_begin, num*sizeof(T), gpuMemcpyHostToDevice, st) );
}
void copyout(T *dst) const
{
if (m_devptr == nullptr)
return;
(void)checkGPU( gpuMemcpy(dst, m_devptr, m_size*sizeof(T), gpuMemcpyDeviceToHost) );
}
void resize(size_t size)
{
if (size == m_size)
return;
if (m_devptr != nullptr)
(void) gpuFree(m_devptr);
(void)checkGPU( gpuMalloc((void**)&m_devptr, size*sizeof(T)) );
m_size = size;
}
device_vector& operator=(const device_vector& other)
{
if (this == &other)
return *this;
/* reallocate only if the current buffer is missing or of the wrong size */
if ( (m_devptr == nullptr) or (m_size != other.m_size) )
{
if (m_devptr != nullptr)
(void) gpuFree(m_devptr);
(void) checkGPU( gpuMalloc((void**)&m_devptr, other.m_size*sizeof(T)) );
}
(void) checkGPU( gpuMemcpy(m_devptr, other.m_devptr, other.m_size*sizeof(T), gpuMemcpyDeviceToDevice) );
m_size = other.m_size;
return *this;
}
device_vector& operator=(device_vector&& other)
{
if (this == &other)
return *this;
/* release our own buffer before stealing the other one, else it leaks */
if (m_devptr != nullptr)
(void) gpuFree(m_devptr);
m_devptr = other.m_devptr;
m_size = other.m_size;
other.m_devptr = nullptr;
other.m_size = 0;
return *this;
}
void zero(void)
{
(void) checkGPU( gpuMemset(m_devptr, 0, m_size*sizeof(T)) );
}
T* data()
{
return m_devptr;
}
const T* data() const
{
return m_devptr;
}
size_t size() const
{
return m_size;
}
~device_vector()
{
if (m_devptr)
(void)checkGPU(gpuFree(m_devptr));
}
};
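/* Usage sketch (hypothetical; std::vector used only for illustration):
 *
 *   std::vector<double> h(1024, 1.0);
 *   device_vector<double> d(h.data(), h.size());  // allocate + copy to device
 *   d.zero();                                     // gpuMemset on the device
 *   d.copyout(h.data());                          // copy back to the host
 */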
template<typename T>
class texture_allocator;
template<>
class texture_allocator<double>
{
#ifndef DISABLE_TEXTURE_CACHE
gpuTextureDesc cTexDesc;
gpuResourceDesc cResDesc;
gpuTextureObject_t cTexObj;
#endif
double *d_texDataPtr;
size_t data_size;
public:
texture_allocator()
: d_texDataPtr(nullptr), data_size(0)
{}
texture_allocator(const double *data, size_t size)
: d_texDataPtr(nullptr), data_size(0)
{
init(data, size);
}
texture_allocator(texture_allocator&& other)
{
#ifndef DISABLE_TEXTURE_CACHE
cTexDesc = other.cTexDesc;
cResDesc = other.cResDesc;
cTexObj = other.cTexObj;
#endif
d_texDataPtr = other.d_texDataPtr;
other.d_texDataPtr = nullptr;
data_size = other.data_size;
other.data_size = 0;
}
texture_allocator(const texture_allocator&) = delete;
texture_allocator& operator=(const texture_allocator&) = delete;
void init(const double *data, size_t size)
{
if (d_texDataPtr)
return;
/* alloc space for data on device */
checkGPU( gpuMalloc((void**)&d_texDataPtr, size*sizeof(double)) );
/* copy data to device */
if (data)
checkGPU( gpuMemcpy(d_texDataPtr, data, size*sizeof(double), gpuMemcpyHostToDevice) );
else
checkGPU( gpuMemset(d_texDataPtr, 0, size*sizeof(double)) );
#ifndef DISABLE_TEXTURE_CACHE
/* cudaTextureDesc */
memset(&cTexDesc, 0, sizeof(cTexDesc));
cTexDesc.normalizedCoords = 0;
cTexDesc.addressMode[0] = gpuAddressModeClamp;
cTexDesc.readMode = gpuReadModeElementType;
/* cudaResourceDesc */
memset(&cResDesc, 0, sizeof(cResDesc));
cResDesc.resType = gpuResourceTypeLinear;
cResDesc.res.linear.devPtr = d_texDataPtr;
cResDesc.res.linear.sizeInBytes = size * sizeof(double);
/* a double is stored as two 32-bit unsigned channels; fetch_tex() reassembles it */
cResDesc.res.linear.desc.f = gpuChannelFormatKindUnsigned;
cResDesc.res.linear.desc.x = 32;
cResDesc.res.linear.desc.y = 32;
/* create texture */
checkGPU( gpuCreateTextureObject(&cTexObj, &cResDesc, &cTexDesc, nullptr) );
#endif
data_size = size;
}
gpuTextureObject_t get_texture(void) const
{
#ifdef DISABLE_TEXTURE_CACHE
return d_texDataPtr;
#else
return cTexObj;
#endif
}
double * data(void) const
{
return d_texDataPtr;
}
size_t size() const
{
return data_size;
}
~texture_allocator()
{
if (d_texDataPtr)
{
#ifndef DISABLE_TEXTURE_CACHE
checkGPU( gpuDestroyTextureObject(cTexObj) );
#endif
checkGPU( gpuFree(d_texDataPtr) );
}
d_texDataPtr = nullptr;
data_size = 0;
}
};
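/* Usage sketch (hypothetical; `kernel` is a placeholder): bind a device array
 * of doubles to a 1D texture object and read it back with fetch_tex():
 *
 *   __global__ void kernel(gpuTextureObject_t tex) {
 *       double v = fetch_tex(tex, threadIdx.x);
 *       ...
 *   }
 *
 *   texture_allocator<double> ta(host_data, n);   // host_data, n assumed
 *   kernel<<<grid, block>>>(ta.get_texture());
 *
 * With DISABLE_TEXTURE_CACHE defined, gpuTextureObject_t becomes a plain
 * double* and fetch_tex a direct load, so the kernel code is unchanged. */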
class timecounter_gpu
{
gpuEvent_t start;
gpuEvent_t stop;
public:
timecounter_gpu()
{
checkGPU( gpuEventCreate(&start) );
checkGPU( gpuEventCreate(&stop) );
}
~timecounter_gpu()
{
checkGPU( gpuEventDestroy(start) );
checkGPU( gpuEventDestroy(stop) );
}
void tic()
{
checkGPU( gpuEventRecord(start, 0) );
}
double toc()
{
checkGPU( gpuEventRecord(stop, 0) );
float elapsed;
checkGPU( gpuEventSynchronize(stop) );
checkGPU( gpuEventElapsedTime(&elapsed, start, stop) );
/* gpuEventElapsedTime reports milliseconds; convert to seconds */
return double(elapsed)/1000.0;
}
};
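/* Usage sketch (hypothetical; `kernel` and its launch config are placeholders):
 *
 *   timecounter_gpu tc;
 *   tc.tic();
 *   kernel<<<grid, block>>>(...);
 *   double seconds = tc.toc();   // blocks until the stop event completes
 *
 * Timing is event-based, so it measures elapsed time on the device's default
 * stream rather than wall-clock time on the host. */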