Skip to content

Commit

Permalink
ParallelFor for Reduction
Browse files Browse the repository at this point in the history
Add capability for ParallelFor to safely do reduction using deviceReduceSum,
Min, etc. The user passes Gpu::KernelInfo{}.setReduction(true) to notify
ParallelFor that this is a parallel reduction, and gives ParallelFor a
callable that takes Gpu::Handler. A Gpu::Handler is needed to call
deviceReduceSum.

Also add Gpu::Buffer class, whose data pointer can be used as a device
destination for deviceReduceSum. It also has a copyToHost method to copy the
device result back to the host.

See Tutorials/GPU/ParallelReduce for examples of how to use ParallelFor for
reduction.

Also note that the reduction function is safe with respect to OpenMP CPU threads. Thus the
same code can also run with OpenMP when it is not built for GPU.
  • Loading branch information
WeiqunZhang committed Dec 21, 2020
1 parent c8cdfa6 commit 66b0f32
Show file tree
Hide file tree
Showing 20 changed files with 905 additions and 169 deletions.
14 changes: 7 additions & 7 deletions Src/AmrCore/AMReX_TagBox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,9 +439,9 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
int bid = h.item.get_group_linear_id();
int tid = h.item.get_local_id(0);
int icell = h.item.get_global_id(0);
int bid = h.item->get_group_linear_id();
int tid = h.item->get_local_id(0);
int icell = h.item->get_global_id(0);

int t = 0;
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
Expand Down Expand Up @@ -517,15 +517,15 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
int bid = h.item.get_group(0);
int tid = h.item.get_local_id(0);
int icell = h.item.get_global_id(0);
int bid = h.item->get_group(0);
int tid = h.item->get_local_id(0);
int icell = h.item->get_global_id(0);

unsigned int* shared_counter = (unsigned int*)h.local;
if (tid == 0) {
*shared_counter = 0;
}
h.item.barrier(sycl::access::fence_space::local_space);
h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
Expand Down
4 changes: 4 additions & 0 deletions Src/Base/AMReX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,10 @@ amrex::Finalize ()
void
amrex::Finalize (amrex::AMReX* pamrex)
{
#ifdef AMREX_USE_GPU
Gpu::synchronize();
#endif

AMReX::erase(pamrex);

BL_TINY_PROFILE_FINALIZE();
Expand Down
14 changes: 14 additions & 0 deletions Src/Base/AMReX_Box.H
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,20 @@ public:
&& p.z >= smallend[2] && p.z <= bigend[2]);
}

//! Returns true if the cell (i,j,k) is contained within this Box.
//! All three index arguments are always accepted; those beyond
//! AMREX_SPACEDIM are ignored, so callers can pass three indices
//! regardless of the build's dimensionality.
AMREX_GPU_HOST_DEVICE
#if (AMREX_SPACEDIM == 1)
bool contains (int i, int, int) const noexcept {
#elif (AMREX_SPACEDIM == 2)
bool contains (int i, int j, int) const noexcept {
#else
bool contains (int i, int j, int k) const noexcept {
#endif
// AMREX_D_TERM expands to only the terms for the active dimensions.
return AMREX_D_TERM(i >= smallend[0] && i <= bigend[0],
&& j >= smallend[1] && j <= bigend[1],
&& k >= smallend[2] && k <= bigend[2]);
}

/** \brief Returns true if argument is contained within Box.
* It is an error if the Boxes have different types.
*/
Expand Down
1 change: 1 addition & 0 deletions Src/Base/AMReX_Gpu.H
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ namespace amrex { namespace Cuda {} }
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_GpuBuffer.H>
#include <AMReX_GpuAsyncArray.H>
#include <AMReX_GpuElixir.H>
#include <AMReX_GpuMemory.H>
Expand Down
11 changes: 8 additions & 3 deletions Src/Base/AMReX_GpuAsyncArray.H
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, n*sizeof(T));
}
#endif
Expand All @@ -51,7 +51,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
}
else
#endif
Expand All @@ -62,6 +62,11 @@ public:

~AsyncArray () { clear(); }

AsyncArray (AsyncArray const&) = delete;
AsyncArray (AsyncArray &&) = delete;
void operator= (AsyncArray const&) = delete;
void operator= (AsyncArray &&) = delete;

T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }
void clear ()
Expand All @@ -88,7 +93,7 @@ public:
#else
// xxxxx DPCPP todo
Gpu::streamSynchronize();
The_Device_Arena()->free(d_data);
The_Arena()->free(d_data);
std::free(h_data);
#endif
}
Expand Down
2 changes: 1 addition & 1 deletion Src/Base/AMReX_GpuAsyncArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ extern "C" {
void* hp = pp[1];
std::free(hp);
std::free(p);
amrex::The_Device_Arena()->free(dp);
amrex::The_Arena()->free(dp);
}
}
#endif
Expand Down
45 changes: 45 additions & 0 deletions Src/Base/AMReX_GpuAtomic.H
Original file line number Diff line number Diff line change
Expand Up @@ -452,5 +452,50 @@ namespace HostDevice { namespace Atomic {

}}

#ifdef AMREX_USE_GPU
// Functor wrappers around the Gpu::Atomic free functions, so an atomic
// reduction operation can be passed around as a stateless callable object.
namespace Gpu {
    //! Atomically add v into *p (no return value).
    template <typename T>
    struct AtomicAdd
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::AddNoRet(p, v);
        }
    };

    //! Atomically update *p to min(*p, v).
    template <typename T>
    struct AtomicMin
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::Min(p, v);
        }
    };

    //! Atomically update *p to max(*p, v).
    template <typename T>
    struct AtomicMax
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::Max(p, v);
        }
    };

    //! Atomically update *p to (*p && v).
    template <typename T>
    struct AtomicLogicalAnd
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::LogicalAnd(p, v);
        }
    };

    //! Atomically update *p to (*p || v).
    template <typename T>
    struct AtomicLogicalOr
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::LogicalOr(p, v);
        }
    };
}
#endif

}
#endif
113 changes: 113 additions & 0 deletions Src/Base/AMReX_GpuBuffer.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#ifndef AMREX_GPU_DEVICE_BUFFER_H_
#define AMREX_GPU_DEVICE_BUFFER_H_
#include <AMReX_Config.H>

#include <AMReX_Arena.H>
#include <AMReX_TypeTraits.H>
#include <AMReX_GpuDevice.H>
#include <cstring>
#include <cstdlib>
#include <initializer_list>
#include <memory>

namespace amrex {
namespace Gpu {

/**
 * \brief Paired pinned-host/device buffer for trivially copyable types.
 *
 * The host copy lives in pinned memory (The_Pinned_Arena) so that the
 * asynchronous host<->device copies are safe; when inside a GPU launch
 * region a device copy is allocated from The_Arena and the data is
 * uploaded.  data() returns the device pointer when it exists, otherwise
 * the host pointer, so it can be passed directly to kernels (e.g., as the
 * destination of deviceReduceSum).  Use copyToHost() to bring the device
 * result back.  Not copyable or movable.
 */
template <typename T, typename std::enable_if<AMREX_IS_TRIVIALLY_COPYABLE(T),int>::type = 0>
class Buffer
{
public:

    //! Construct from an initializer list, uploading to the device if possible.
    Buffer (std::initializer_list<T> init)
        : m_size(init.size())
    {
        if (m_size == 0) return;
        allocate_and_copy(init.begin());
    }

    //! Construct from n elements at host pointer h_p, uploading to the device if possible.
    Buffer (T const* h_p, const std::size_t n)
        : m_size(n)
    {
        if (m_size == 0) return;
        allocate_and_copy(h_p);
    }

    ~Buffer () { clear(); }

    Buffer (Buffer const&) = delete;
    Buffer (Buffer &&) = delete;
    void operator= (Buffer const&) = delete;
    void operator= (Buffer &&) = delete;

    //! Device pointer if one exists, otherwise the host pointer.
    T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
    T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }

    //! Host pointer (pinned memory when built for GPU).
    T const* hostData () const noexcept { return h_data; }
    T* hostData () noexcept { return h_data; }

    //! \deprecated Misspelled accessor kept for backward compatibility; use hostData().
    [[deprecated("use hostData()")]] T* hostDatat () noexcept { return h_data; }

    //! Number of elements.
    std::size_t size () const noexcept { return m_size; }

    //! Free both copies.  The data pointers become null; size() is unchanged.
    void clear ()
    {
#ifdef AMREX_USE_GPU
        if (d_data) The_Arena()->free(d_data);
        if (h_data) The_Pinned_Arena()->free(h_data);
#else
        std::free(h_data);
#endif
        d_data = nullptr;
        h_data = nullptr;
    }

    //! Copy the device data back to the host (synchronizing the stream)
    //! and return the host pointer.
    T* copyToHost ()
    {
#ifdef AMREX_USE_GPU
        if (d_data)
        {
            Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T));
            Gpu::streamSynchronize();
        }
#endif
        return h_data;
    }

private:

    //! Allocate host (and, in a launch region, device) storage and copy
    //! m_size elements from p.  Shared by both constructors.
    void allocate_and_copy (T const* p)
    {
#ifdef AMREX_USE_GPU
        h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
#else
        h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
#endif
        std::memcpy(h_data, p, m_size*sizeof(T));
#ifdef AMREX_USE_GPU
        if (Gpu::inLaunchRegion())
        {
            d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
            Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
#ifdef AMREX_USE_DPCPP
            // Async copies on the null SYCL queue must be synchronized explicitly.
            if (Gpu::onNullStream()) Gpu::synchronize();
#endif
        }
#endif
    }

    std::size_t m_size;
    T* d_data = nullptr;
    T* h_data = nullptr;
};

}
}

#endif
Loading

0 comments on commit 66b0f32

Please sign in to comment.