Skip to content

Commit

Permalink
Remove Atomic::Inc and Dec
Browse files Browse the repository at this point in the history
The implementations of Atomic::Inc and Dec in our SYCL backend use compare
and exchange, because these functions do not exist in the SYCL standard.  This
turns out to be very inefficient.  Although both CUDA and HIP have efficient
implementations of atomicInc and atomicDec, we are removing Atomic::Inc and
Dec from amrex to avoid surprises.  In all cases where we have used these
functions, they can be replaced with Atomic::Add.
  • Loading branch information
WeiqunZhang committed Jan 31, 2023
1 parent 225c605 commit 192532e
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 79 deletions.
6 changes: 3 additions & 3 deletions Src/AmrCore/AMReX_TagBox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -542,8 +542,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
(shared_counter, 20480u);
unsigned int itag = Gpu::Atomic::Add<sycl::access::address_space::local_space>
(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
Expand All @@ -570,7 +570,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
__syncthreads();

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc(shared_counter, blockDim.x);
unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
Expand Down
67 changes: 0 additions & 67 deletions Src/Base/AMReX_GpuAtomic.H
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ namespace Gpu { namespace Atomic {

// For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
// For LogicalOr and LogicalAnd, the data type is int.
// For Inc and Dec, the data type is unsigned int.
// For Exch and CAS, the data type is generic.
// All these functions are non-atomic in host code!!!
// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
Expand Down Expand Up @@ -456,72 +455,6 @@ namespace detail {
#endif
}

////////////////////////////////////////////////////////////////////////
// Inc
////////////////////////////////////////////////////////////////////////

#ifdef AMREX_USE_DPCPP
// SYCL/DPC++ build: the SYCL standard has no atomicInc, so emulate CUDA's
// atomicInc semantics with a compare-and-exchange loop:
//   old = *m; *m = (old >= value) ? 0 : old+1; return old;
// i.e. a counter that wraps back to 0 once it reaches `value`.
// The template parameter AS selects the address space (global vs local).
template <sycl::access::address_space AS = sycl::access::address_space::global_space>
AMREX_FORCE_INLINE
unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
{
#if defined(__SYCL_DEVICE_ONLY__)
// Relaxed, device-scope atomic_ref: only atomicity is needed, no ordering.
constexpr auto mo = sycl::memory_order::relaxed;
constexpr auto ms = sycl::memory_scope::device;
sycl::atomic_ref<unsigned int,mo,ms,AS> a{*m};
unsigned int oldi = a.load(), newi;
do {
// wrap to 0 once the counter reaches `value`
newi = (oldi >= value) ? 0u : (oldi+1u);
} while (! a.compare_exchange_strong(oldi, newi));
return oldi;
#else
// Host fallback: plain read-modify-write, NOT atomic (see the note at the
// top of this header).
auto const old = *m;
*m = (old >= value) ? 0u : (old+1u);
return old;
#endif
}
#else
// CUDA/HIP build: map directly onto the hardware atomicInc intrinsic in
// device code; plain (non-atomic) read-modify-write on the host.
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
{
#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
return atomicInc(m, value);
#else
// Host fallback: NOT atomic.
auto const old = *m;
*m = (old >= value) ? 0u : (old+1u);
return old;
#endif
}
#endif

////////////////////////////////////////////////////////////////////////
// Dec
////////////////////////////////////////////////////////////////////////

// atomicDec semantics:
//   old = *m; *m = (old == 0 || old > value) ? value : old-1; return old;
// i.e. a counter that wraps back to `value` when it hits 0 (or exceeds it).
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
unsigned int Dec (unsigned int* const m, unsigned int const value) noexcept
{
#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
// CUDA/HIP device code: hardware atomicDec intrinsic.
return atomicDec(m, value);
#elif defined(__SYCL_DEVICE_ONLY__)
// SYCL has no atomicDec; emulate it with a relaxed, device-scope
// compare-and-exchange loop on a global-address-space atomic_ref.
constexpr auto mo = sycl::memory_order::relaxed;
constexpr auto ms = sycl::memory_scope::device;
constexpr auto as = sycl::access::address_space::global_space;
sycl::atomic_ref<unsigned int,mo,ms,as> a{*m};
unsigned int oldi = a.load(), newi;
do {
// wrap back to `value` when the counter is 0 or above `value`
newi = ((oldi == 0u) || (oldi > value)) ? value : (oldi-1u);
} while (! a.compare_exchange_strong(oldi, newi));
return oldi;
#else
// Host fallback: plain read-modify-write, NOT atomic (see the note at the
// top of this header).
auto const old = *m;
*m = ((old == 0u) || (old > value)) ? value : (old-1u);
return old;
#endif
}

////////////////////////////////////////////////////////////////////////
// Exch
////////////////////////////////////////////////////////////////////////
Expand Down
6 changes: 3 additions & 3 deletions Src/Base/AMReX_Scan.H
Original file line number Diff line number Diff line change
Expand Up @@ -436,8 +436,8 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
if (gridDimx > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdxx == 0) {
unsigned int bid = Gpu::Atomic::Inc<sycl::access::address_space::global_space>
(virtual_block_id_p, gridDimx);
unsigned int bid = Gpu::Atomic::Add<sycl::access::address_space::global_space>
(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
gh.item->barrier(sycl::access::fence_space::local_space);
Expand Down Expand Up @@ -942,7 +942,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
if (gridDim.x > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdx.x == 0) {
unsigned int bid = Gpu::Atomic::Inc(virtual_block_id_p, gridDim.x);
unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
__syncthreads();
Expand Down
3 changes: 1 addition & 2 deletions Src/Particle/AMReX_NeighborParticlesI.H
Original file line number Diff line number Diff line change
Expand Up @@ -1036,7 +1036,6 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)

Gpu::Buffer<unsigned int> np_boundary({0});
unsigned int* p_np_boundary = np_boundary.data();
constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();

AMREX_FOR_1D ( np_real, i,
{
Expand Down Expand Up @@ -1065,7 +1064,7 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)
if (call_check_pair(check_pair, ptile_data, ptile_data, i, pperm[p])) {
IntVect cell_ijk = getParticleCell(pstruct[pperm[p]], plo, dxi, domain);
if (!box.contains(cell_ijk)) {
int loc = Gpu::Atomic::Inc(p_np_boundary, max_unsigned_int);
unsigned int loc = Gpu::Atomic::Add(p_np_boundary, 1u);
p_boundary_particle_ids[loc] = i;
isActualNeighbor = true;
break;
Expand Down
6 changes: 2 additions & 4 deletions Src/Particle/AMReX_ParticleCommunication.H
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,6 @@ struct ParticleCopyPlan
auto p_dst_box_counts = m_box_counts_d.dataPtr();
auto getBucket = pc.BufferMap().getBucketFunctor();

constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();

m_dst_indices.resize(num_levels);
for (int lev = 0; lev < num_levels; ++lev)
{
Expand All @@ -174,8 +172,8 @@ struct ParticleCopyPlan
if (dst_box >= 0)
{
int dst_lev = p_levs[i];
int index = Gpu::Atomic::Inc(
&p_dst_box_counts[getBucket(dst_lev, dst_box)], max_unsigned_int);
int index = static_cast<int>(Gpu::Atomic::Add(
&p_dst_box_counts[getBucket(dst_lev, dst_box)], 1u));
p_dst_indices[i] = index;
}
});
Expand Down

0 comments on commit 192532e

Please sign in to comment.