ParallelFor for Reduction (#1658)
Add capability for `ParallelFor` to safely perform reductions using `deviceReduceSum`,
`Min`, etc.  The user passes `Gpu::KernelInfo{}.setReduction(true)` to notify
`ParallelFor` that this is a parallel reduction, and gives `ParallelFor` a callable
that takes a `Gpu::Handler`.  A `Gpu::Handler` is needed to call `deviceReduceSum`.
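
A minimal sketch of the intended call pattern (an assumption-laden example, not verbatim
from the tutorial; the box `bx`, the `Array4` `fab`, and the zero-initialized device
pointer `dp` are placeholders):

    // Hedged sketch: sum fab(i,j,k) over bx into the device location *dp.
    // dp is assumed to point at a zero-initialized value, e.g. from Gpu::Buffer below.
    amrex::ParallelFor(Gpu::KernelInfo{}.setReduction(true), bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept
        {
            Gpu::deviceReduceSum(dp, fab(i,j,k), handler);
        });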

Also add `Gpu::Buffer` class, whose data pointer can be used as a device
destination for `deviceReduceSum`.  It also has a `copyToHost` method to
copy the device result back to the host. 
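
For example, a one-element `Gpu::Buffer` can serve as the destination for the sketch
above (again a hedged sketch under the same assumptions):

    // Hedged sketch: device/host buffer holding the reduction result.
    Gpu::Buffer<Real> sum_buf({0.0});   // starts at zero on both host and device
    Real* dp = sum_buf.data();          // device pointer when built for GPU, host pointer otherwise
    // ... run the reduction ParallelFor shown above with dp ...
    Real sum = *(sum_buf.copyToHost()); // copy the device result back and read it on the host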

See `Tutorials/GPU/ParallelReduce` for examples of how to use `ParallelFor`
for reduction.

Also note that the reduction functions are safe to use with OpenMP CPU threads, so the
same code can run with OpenMP when it is not built for GPU.

Co-authored-by: Andrew Myers <atmyers2@gmail.com>
WeiqunZhang and atmyers authored Dec 21, 2020
1 parent c8cdfa6 commit 10ed0e0
Showing 21 changed files with 963 additions and 177 deletions.
14 changes: 7 additions & 7 deletions Src/AmrCore/AMReX_TagBox.cpp
@@ -439,9 +439,9 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
- int bid = h.item.get_group_linear_id();
- int tid = h.item.get_local_id(0);
- int icell = h.item.get_global_id(0);
+ int bid = h.item->get_group_linear_id();
+ int tid = h.item->get_local_id(0);
+ int icell = h.item->get_global_id(0);

int t = 0;
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
@@ -517,15 +517,15 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
- int bid = h.item.get_group(0);
- int tid = h.item.get_local_id(0);
- int icell = h.item.get_global_id(0);
+ int bid = h.item->get_group(0);
+ int tid = h.item->get_local_id(0);
+ int icell = h.item->get_global_id(0);

unsigned int* shared_counter = (unsigned int*)h.local;
if (tid == 0) {
*shared_counter = 0;
}
- h.item.barrier(sycl::access::fence_space::local_space);
+ h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
4 changes: 4 additions & 0 deletions Src/Base/AMReX.cpp
@@ -586,6 +586,10 @@ amrex::Finalize ()
void
amrex::Finalize (amrex::AMReX* pamrex)
{
#ifdef AMREX_USE_GPU
Gpu::synchronize();
#endif

AMReX::erase(pamrex);

BL_TINY_PROFILE_FINALIZE();
14 changes: 14 additions & 0 deletions Src/Base/AMReX_Box.H
@@ -221,6 +221,20 @@ public:
&& p.z >= smallend[2] && p.z <= bigend[2]);
}

//! Returns true if argument is contained within Box.
AMREX_GPU_HOST_DEVICE
#if (AMREX_SPACEDIM == 1)
bool contains (int i, int, int) const noexcept {
#elif (AMREX_SPACEDIM == 2)
bool contains (int i, int j, int) const noexcept {
#else
bool contains (int i, int j, int k) const noexcept {
#endif
return AMREX_D_TERM(i >= smallend[0] && i <= bigend[0],
&& j >= smallend[1] && j <= bigend[1],
&& k >= smallend[2] && k <= bigend[2]);
}

/** \brief Returns true if argument is contained within Box.
* It is an error if the Boxes have different types.
*/
1 change: 1 addition & 0 deletions Src/Base/AMReX_Gpu.H
@@ -22,6 +22,7 @@ namespace amrex { namespace Cuda {} }
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_GpuBuffer.H>
#include <AMReX_GpuAsyncArray.H>
#include <AMReX_GpuElixir.H>
#include <AMReX_GpuMemory.H>
11 changes: 8 additions & 3 deletions Src/Base/AMReX_GpuAsyncArray.H
@@ -38,7 +38,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
- d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
+ d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, n*sizeof(T));
}
#endif
@@ -51,7 +51,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
- d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
+ d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
}
else
#endif
@@ -62,6 +62,11 @@

~AsyncArray () { clear(); }

AsyncArray (AsyncArray const&) = delete;
AsyncArray (AsyncArray &&) = delete;
void operator= (AsyncArray const&) = delete;
void operator= (AsyncArray &&) = delete;

T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }
void clear ()
@@ -88,7 +93,7 @@ public:
#else
// xxxxx DPCPP todo
Gpu::streamSynchronize();
- The_Device_Arena()->free(d_data);
+ The_Arena()->free(d_data);
std::free(h_data);
#endif
}
2 changes: 1 addition & 1 deletion Src/Base/AMReX_GpuAsyncArray.cpp
@@ -17,7 +17,7 @@ extern "C" {
void* hp = pp[1];
std::free(hp);
std::free(p);
- amrex::The_Device_Arena()->free(dp);
+ amrex::The_Arena()->free(dp);
}
}
#endif
45 changes: 45 additions & 0 deletions Src/Base/AMReX_GpuAtomic.H
@@ -452,5 +452,50 @@ namespace HostDevice { namespace Atomic {

}}

#ifdef AMREX_USE_GPU
// functors
namespace Gpu {
template <typename T>
struct AtomicAdd
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::AddNoRet(dest, source);
}
};

template <typename T>
struct AtomicMin
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::Min(dest, source);
}
};

template <typename T>
struct AtomicMax
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::Max(dest, source);
}
};

template <typename T>
struct AtomicLogicalAnd
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::LogicalAnd(dest, source);
}
};

template <typename T>
struct AtomicLogicalOr
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::LogicalOr(dest, source);
}
};
}
#endif

}
#endif
113 changes: 113 additions & 0 deletions Src/Base/AMReX_GpuBuffer.H
@@ -0,0 +1,113 @@
#ifndef AMREX_GPU_DEVICE_BUFFER_H_
#define AMREX_GPU_DEVICE_BUFFER_H_
#include <AMReX_Config.H>

#include <AMReX_Arena.H>
#include <AMReX_TypeTraits.H>
#include <AMReX_GpuDevice.H>
#include <cstring>
#include <cstdlib>
#include <initializer_list>
#include <memory>

namespace amrex {
namespace Gpu {

template <typename T, typename std::enable_if<AMREX_IS_TRIVIALLY_COPYABLE(T),int>::type = 0>
class Buffer
{
public:

Buffer (std::initializer_list<T> init)
: m_size(init.size())
{
if (m_size == 0) return;
#ifdef AMREX_USE_GPU
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
#else
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
#endif
std::memcpy(h_data, init.begin(), m_size*sizeof(T));
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
#ifdef AMREX_USE_DPCPP
if (Gpu::onNullStream()) Gpu::synchronize();
#endif
}
#endif
}

Buffer (T const* h_p, const std::size_t n)
: m_size(n)
{
if (m_size == 0) return;
#ifdef AMREX_USE_GPU
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
#else
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
#endif
std::memcpy(h_data, h_p, m_size*sizeof(T));
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
#ifdef AMREX_USE_DPCPP
if (Gpu::onNullStream()) Gpu::synchronize();
#endif
}
#endif
}

~Buffer () { clear(); }

Buffer (Buffer const&) = delete;
Buffer (Buffer &&) = delete;
void operator= (Buffer const&) = delete;
void operator= (Buffer &&) = delete;

T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }

T const* hostData () const noexcept { return h_data; }
T* hostData () noexcept { return h_data; }

std::size_t size () const noexcept { return m_size; }

void clear ()
{
#ifdef AMREX_USE_GPU
if (d_data) The_Arena()->free(d_data);
if (h_data) The_Pinned_Arena()->free(h_data);
#else
std::free(h_data);
#endif
d_data = nullptr;
h_data = nullptr;
}

T* copyToHost ()
{
#ifdef AMREX_USE_GPU
if (d_data)
{
Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T));
Gpu::streamSynchronize();
}
#endif
return h_data;
}

private:
std::size_t m_size;
T* d_data = nullptr;
T* h_data = nullptr;
};

}
}

#endif