Remove Atomic::Inc and Dec (AMReX-Codes#3117)
The implementation of Atomic::Inc and Atomic::Dec in our SYCL backend uses
compare-and-exchange, because these operations do not exist in the SYCL
standard. This turns out to be very inefficient. Although both CUDA and HIP
have efficient implementations of atomicInc and atomicDec, we are removing
Atomic::Inc and Dec from amrex to avoid surprises. In all the cases where we
have used these functions, they can be replaced with Atomic::Add.
WeiqunZhang authored Feb 1, 2023
1 parent eefe246 commit c09ed7d
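
A minimal standalone sketch (not AMReX code; inc_model and add_model are hypothetical helpers) of why the replacement is safe: atomicInc(m, limit) returns the old value and wraps the counter to zero only when that old value reaches limit, so as long as the counter stays below limit, a plain fetch-and-add hands out exactly the same indices.

#include <cassert>
#include <climits>

// Scalar models of the two operations (hypothetical helpers, not AMReX API).
unsigned int inc_model (unsigned int* m, unsigned int limit) {
    unsigned int old = *m;
    *m = (old >= limit) ? 0u : old + 1u;   // atomicInc semantics: wrap at 'limit'
    return old;
}

unsigned int add_model (unsigned int* m) {
    unsigned int old = *m;
    *m = old + 1u;                         // plain fetch-and-add, no wrap
    return old;
}

int main () {
    unsigned int a = 0u, b = 0u;
    for (int i = 0; i < 1000; ++i) {
        // With a limit that is never reached (e.g. UINT_MAX, or blockDim.x when
        // at most blockDim.x threads increment a per-block counter), the wrap
        // branch is dead code and both calls return identical indices.
        assert(inc_model(&a, UINT_MAX) == add_model(&b));
    }
    return 0;
}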
Showing 5 changed files with 30 additions and 81 deletions.
Src/AmrCore/AMReX_TagBox.cpp (4 additions & 3 deletions)
@@ -542,8 +542,9 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
- unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
- (shared_counter, 20480u);
+ unsigned int itag = Gpu::Atomic::Add<unsigned int,
+ sycl::access::address_space::local_space>
+ (shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
@@ -570,7 +571,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
__syncthreads();

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
- unsigned int itag = Gpu::Atomic::Inc(shared_counter, blockDim.x);
+ unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
Src/Base/AMReX_GpuAtomic.H (21 additions & 69 deletions)
@@ -14,7 +14,6 @@ namespace Gpu { namespace Atomic {

// For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
// For LogicalOr and LogicalAnd, the data type is int.
- // For Inc and Dec, the data type is unsigned int.
// For Exch and CAS, the data type is generic.
// All these functions are non-atomic in host code!!!
// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
@@ -125,7 +124,11 @@ namespace detail {

#ifdef AMREX_USE_GPU

+ #ifdef AMREX_USE_DPCPP
+ template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
+ #else
template<class T>
+ #endif
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T Add_device (T* const sum, T const value) noexcept
{
@@ -135,8 +138,7 @@
#elif defined(__SYCL_DEVICE_ONLY__)
constexpr auto mo = sycl::memory_order::relaxed;
constexpr auto ms = sycl::memory_scope::device;
- constexpr auto as = sycl::access::address_space::global_space;
- sycl::atomic_ref<T,mo,ms,as> a{*sum};
+ sycl::atomic_ref<T,mo,ms,AS> a{*sum};
return a.fetch_add(value);
#else
amrex::ignore_unused(sum, value);
@@ -185,12 +187,20 @@

#endif

+ #ifdef AMREX_USE_DPCPP
+ template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
+ #else
template<class T>
+ #endif
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
T Add (T* const sum, T const value) noexcept
{
#if AMREX_DEVICE_COMPILE
+ #ifdef AMREX_USE_DPCPP
+ return Add_device<T,AS>(sum, value);
+ #else
return Add_device(sum, value);
+ #endif
#else
auto old = *sum;
*sum += value;
@@ -260,12 +270,20 @@
// AddNoRet
////////////////////////////////////////////////////////////////////////

+ #ifdef AMREX_USE_DPCPP
+ template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
+ #else
template<class T>
+ #endif
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void AddNoRet (T* const sum, T const value) noexcept
{
#if AMREX_DEVICE_COMPILE
+ #ifdef AMREX_USE_DPCPP
+ Add_device<T,AS>(sum, value);
+ #else
Add_device(sum, value);
+ #endif
#else
*sum += value;
#endif
@@ -456,72 +474,6 @@
#endif
}

- ////////////////////////////////////////////////////////////////////////
- // Inc
- ////////////////////////////////////////////////////////////////////////
-
- #ifdef AMREX_USE_DPCPP
- template <sycl::access::address_space AS = sycl::access::address_space::global_space>
- AMREX_FORCE_INLINE
- unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
- {
- #if defined(__SYCL_DEVICE_ONLY__)
- constexpr auto mo = sycl::memory_order::relaxed;
- constexpr auto ms = sycl::memory_scope::device;
- sycl::atomic_ref<unsigned int,mo,ms,AS> a{*m};
- unsigned int oldi = a.load(), newi;
- do {
- newi = (oldi >= value) ? 0u : (oldi+1u);
- } while (! a.compare_exchange_strong(oldi, newi));
- return oldi;
- #else
- auto const old = *m;
- *m = (old >= value) ? 0u : (old+1u);
- return old;
- #endif
- }
- #else
- AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
- unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
- {
- #if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
- defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
- return atomicInc(m, value);
- #else
- auto const old = *m;
- *m = (old >= value) ? 0u : (old+1u);
- return old;
- #endif
- }
- #endif
-
- ////////////////////////////////////////////////////////////////////////
- // Dec
- ////////////////////////////////////////////////////////////////////////
-
- AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
- unsigned int Dec (unsigned int* const m, unsigned int const value) noexcept
- {
- #if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
- defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
- return atomicDec(m, value);
- #elif defined(__SYCL_DEVICE_ONLY__)
- constexpr auto mo = sycl::memory_order::relaxed;
- constexpr auto ms = sycl::memory_scope::device;
- constexpr auto as = sycl::access::address_space::global_space;
- sycl::atomic_ref<unsigned int,mo,ms,as> a{*m};
- unsigned int oldi = a.load(), newi;
- do {
- newi = ((oldi == 0u) || (oldi > value)) ? value : (oldi-1u);
- } while (! a.compare_exchange_strong(oldi, newi));
- return oldi;
- #else
- auto const old = *m;
- *m = ((old == 0u) || (old > value)) ? value : (old-1u);
- return old;
- #endif
- }
-
////////////////////////////////////////////////////////////////////////
// Exch
////////////////////////////////////////////////////////////////////////
Src/Base/AMReX_Scan.H (2 additions & 3 deletions)
@@ -436,8 +436,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
if (gridDimx > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdxx == 0) {
- unsigned int bid = Gpu::Atomic::Inc<sycl::access::address_space::global_space>
- (virtual_block_id_p, gridDimx);
+ unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
gh.item->barrier(sycl::access::fence_space::local_space);
@@ -942,7 +941,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
if (gridDim.x > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdx.x == 0) {
- unsigned int bid = Gpu::Atomic::Inc(virtual_block_id_p, gridDim.x);
+ unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
__syncthreads();
Src/Particle/AMReX_NeighborParticlesI.H (1 addition & 2 deletions)
@@ -1036,7 +1036,6 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)

Gpu::Buffer<unsigned int> np_boundary({0});
unsigned int* p_np_boundary = np_boundary.data();
- constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();

AMREX_FOR_1D ( np_real, i,
{
@@ -1065,7 +1064,7 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)
if (call_check_pair(check_pair, ptile_data, ptile_data, i, pperm[p])) {
IntVect cell_ijk = getParticleCell(pstruct[pperm[p]], plo, dxi, domain);
if (!box.contains(cell_ijk)) {
- int loc = Gpu::Atomic::Inc(p_np_boundary, max_unsigned_int);
+ unsigned int loc = Gpu::Atomic::Add(p_np_boundary, 1u);
p_boundary_particle_ids[loc] = i;
isActualNeighbor = true;
break;
Src/Particle/AMReX_ParticleCommunication.H (2 additions & 4 deletions)
@@ -152,8 +152,6 @@ struct ParticleCopyPlan
auto p_dst_box_counts = m_box_counts_d.dataPtr();
auto getBucket = pc.BufferMap().getBucketFunctor();

- constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();
-
m_dst_indices.resize(num_levels);
for (int lev = 0; lev < num_levels; ++lev)
{
@@ -174,8 +172,8 @@
if (dst_box >= 0)
{
int dst_lev = p_levs[i];
- int index = Gpu::Atomic::Inc(
- &p_dst_box_counts[getBucket(dst_lev, dst_box)], max_unsigned_int);
+ int index = static_cast<int>(Gpu::Atomic::Add(
+ &p_dst_box_counts[getBucket(dst_lev, dst_box)], 1u));
p_dst_indices[i] = index;
}
});
