Skip to content

Commit

Permalink
hip: device and streams
Browse files Browse the repository at this point in the history
  • Loading branch information
WeiqunZhang committed May 29, 2019
1 parent e3840f9 commit d89f895
Show file tree
Hide file tree
Showing 13 changed files with 241 additions and 144 deletions.
6 changes: 3 additions & 3 deletions Src/Base/AMReX_BLFort.H
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
{ \
dim3 numBlocks, numThreads; \
amrex::Gpu::Device::c_threads_and_blocks(box.loVect(), box.hiVect(), numBlocks, numThreads); \
function<<<numBlocks, numThreads, 0, amrex::Gpu::Device::cudaStream()>>>(__VA_ARGS__); \
function<<<numBlocks, numThreads, 0, amrex::Gpu::Device::gpuStream()>>>(__VA_ARGS__); \
AMREX_GPU_ERROR_CHECK(); \
}

Expand Down Expand Up @@ -90,14 +90,14 @@
dim3 function##numBlocks, function##numThreads; \
amrex::Gpu::Device::grid_stride_threads_and_blocks(function##numBlocks, function##numThreads); \
AMREX_GPU_SAFE_CALL(cudaFuncSetAttribute(&cuda_##function, cudaFuncAttributePreferredSharedMemoryCarveout, 0)); \
cuda_##function<<<function##numBlocks, function##numThreads, 0, amrex::Gpu::Device::cudaStream()>>> \
cuda_##function<<<function##numBlocks, function##numThreads, 0, amrex::Gpu::Device::gpuStream()>>> \

#else

#define AMREX_DEVICE_LAUNCH(function) \
dim3 function##numBlocks, function##numThreads; \
amrex::Gpu::Device::grid_stride_threads_and_blocks(function##numBlocks, function##numThreads); \
cuda_##function<<<function##numBlocks, function##numThreads, 0, amrex::Gpu::Device::cudaStream()>>> \
cuda_##function<<<function##numBlocks, function##numThreads, 0, amrex::Gpu::Device::gpuStream()>>> \

#endif

Expand Down
6 changes: 3 additions & 3 deletions Src/Base/AMReX_BaseFab.H
Original file line number Diff line number Diff line change
Expand Up @@ -1707,7 +1707,7 @@ BaseFab<T,Allocator>::prefetchToHost () const noexcept
std::size_t s = sizeof(T)*this->nvar*this->domain.numPts();
AMREX_GPU_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s,
cudaCpuDeviceId,
Cuda::Device::cudaStream()));
Gpu::Device::gpuStream()));
#endif
}

Expand All @@ -1718,8 +1718,8 @@ BaseFab<T,Allocator>::prefetchToDevice () const noexcept
#ifdef AMREX_USE_CUDA
std::size_t s = sizeof(T)*this->nvar*this->domain.numPts();
AMREX_GPU_SAFE_CALL(cudaMemPrefetchAsync(this->dptr, s,
Cuda::Device::deviceId(),
Cuda::Device::cudaStream()));
Gpu::Device::deviceId(),
Gpu::Device::gpuStream()));
#endif
}

Expand Down
2 changes: 1 addition & 1 deletion Src/Base/AMReX_CudaAllocators.H
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace amrex {
amrex::Abort();
}

const int device = Cuda::Device::deviceId();
const int device = Gpu::Device::deviceId();
AMREX_GPU_SAFE_CALL(cudaMemAdvise(result, n*sizeof(T),
cudaMemAdviseSetPreferredLocation, device));

Expand Down
6 changes: 3 additions & 3 deletions Src/Base/AMReX_CudaAsyncArray.H
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public:
{
d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
AMREX_GPU_SAFE_CALL(cudaMemcpyAsync(d_data, h_data, n*sizeof(T),
cudaMemcpyHostToDevice, Device::cudaStream()));
cudaMemcpyHostToDevice, Gpu::Device::gpuStream()));
}
#endif
}
Expand Down Expand Up @@ -67,7 +67,7 @@ public:
T** p = static_cast<T**>(std::malloc(2*sizeof(T*)));
p[0] = d_data;
p[1] = h_data;
AMREX_GPU_SAFE_CALL(cudaStreamAddCallback(Device::cudaStream(),
AMREX_GPU_SAFE_CALL(cudaStreamAddCallback(Gpu::Device::gpuStream(),
amrex_asyncarray_delete, p, 0));
}
}
Expand All @@ -87,7 +87,7 @@ public:
if (d_data)
{
AMREX_GPU_SAFE_CALL(cudaMemcpyAsync(h_p, d_data, n*sizeof(T),
cudaMemcpyDeviceToHost, Device::cudaStream()));
cudaMemcpyDeviceToHost, Gpu::Device::gpuStream()));
}
else
#endif
Expand Down
4 changes: 2 additions & 2 deletions Src/Base/AMReX_CudaAsyncFab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ AsyncFab::clear ()
if (Gpu::inLaunchRegion())
{
if (m_impl != nullptr) {
// CUDA 10 AMREX_CUDA_SAFE_CALL(cudaLaunchHostFunc(Device::cudaStream(), amrex_devicefab_delete, p));
AMREX_CUDA_SAFE_CALL(cudaStreamAddCallback(Device::cudaStream(),
// CUDA 10 AMREX_CUDA_SAFE_CALL(cudaLaunchHostFunc(Gpu::Device::gpuStream(), amrex_devicefab_delete, p));
AMREX_CUDA_SAFE_CALL(cudaStreamAddCallback(Gpu::Device::gpuStream(),
amrex_devicefab_delete,
m_impl, 0));
}
Expand Down
2 changes: 1 addition & 1 deletion Src/Base/AMReX_CudaAsyncFabImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ AsyncFabImpl::copy_htod ()
m_cpu_fab_data = m_cpu_fab;
m_cpu_fab_data.setOwner(false);
AMREX_CUDA_SAFE_CALL(cudaMemcpyAsync(dest, &m_cpu_fab_data, sizeof(BaseFabData<Real>),
cudaMemcpyHostToDevice, Device::cudaStream()));
cudaMemcpyHostToDevice, Gpu::Device::gpuStream()));
}
else
{
Expand Down
32 changes: 21 additions & 11 deletions Src/Base/AMReX_CudaDevice.H
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,16 @@
#include <AMReX_GpuControl.H>

namespace amrex {
namespace Cuda {

#if defined(AMREX_USE_HIP)
using gpuStream_t = hipStream_t;
using gpuDeviceProp_t = hipDeviceProp_t;
#elif defined(AMREX_USE_CUDA)
using gpuStream_t = cudaStream_t;
using gpuDeviceProp_t = cudaDeviceProp;
#endif

namespace Gpu {

class Device
{
Expand All @@ -20,8 +29,8 @@ public:
static void Initialize ();
static void Finalize ();

#if defined(AMREX_USE_CUDA)
static cudaStream_t cudaStream () noexcept { return cuda_stream; }
#if defined(AMREX_USE_GPU)
static gpuStream_t gpuStream () noexcept { return gpu_stream; }
#endif
static void setStreamIndex (const int idx) noexcept;
static void resetStreamIndex () noexcept { setStreamIndex(-1); }
Expand All @@ -39,7 +48,7 @@ public:
static void mem_advise_set_preferred (void* p, const std::size_t sz, const int device);
static void mem_advise_set_readonly (void* p, const std::size_t sz);

#if defined(AMREX_USE_CUDA)
#ifdef AMREX_USE_GPU
static void setNumThreadsMin (int nx, int ny, int nz) noexcept;
static void n_threads_and_blocks (const long N, dim3& numBlocks, dim3& numThreads) noexcept;
static void c_comps_threads_and_blocks (const int* lo, const int* hi, const int comps,
Expand All @@ -60,20 +69,21 @@ public:

private:

static void initialize_cuda ();
static void initialize_gpu ();

static int device_id;
static int verbose;

#if defined(AMREX_USE_CUDA)
static constexpr int max_cuda_streams = 16;
static std::array<cudaStream_t,max_cuda_streams> cuda_streams;
static cudaStream_t cuda_stream;

#ifdef AMREX_USE_GPU
static constexpr int max_gpu_streams = 16;
static dim3 numThreadsMin;
static dim3 numBlocksOverride, numThreadsOverride;

static cudaDeviceProp device_prop;
static std::array<hipStream_t,max_gpu_streams> gpu_streams;
static gpuStream_t gpu_stream;
static gpuDeviceProp_t device_prop;

static int warp_size;
#endif
};

Expand Down
Loading

0 comments on commit d89f895

Please sign in to comment.