Skip to content

Commit

Permalink
Remove Atomic::Inc and Dec
Browse files Browse the repository at this point in the history
The implementations of Atomic::Inc and Dec in our SYCL backend use compare
and exchange, because these functions do not exist in the SYCL standard.  This
turns out to be very inefficient.  Although both CUDA and HIP have efficient
implementations of atomicInc and atomicDec, we are removing Atomic::Inc and
Dec from amrex to avoid surprises.  In all cases where we have used these
functions, they can be replaced with Atomic::Add.
  • Loading branch information
WeiqunZhang committed Jan 31, 2023
1 parent 225c605 commit 192532e
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 79 deletions.
6 changes: 3 additions & 3 deletions Src/AmrCore/AMReX_TagBox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -542,8 +542,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
(shared_counter, 20480u);
unsigned int itag = Gpu::Atomic::Add<sycl::access::address_space::local_space>
(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
Expand All @@ -570,7 +570,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
__syncthreads();

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc(shared_counter, blockDim.x);
unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
Expand Down
67 changes: 0 additions & 67 deletions Src/Base/AMReX_GpuAtomic.H
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ namespace Gpu { namespace Atomic {

// For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
// For LogicalOr and LogicalAnd, the data type is int.
// For Inc and Dec, the data type is unsigned int.
// For Exch and CAS, the data type is generic.
// All these functions are non-atomic in host code!!!
// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
Expand Down Expand Up @@ -456,72 +455,6 @@ namespace detail {
#endif
}

////////////////////////////////////////////////////////////////////////
// Inc
////////////////////////////////////////////////////////////////////////

#ifdef AMREX_USE_DPCPP
// SYCL/DPC++ build: the SYCL standard has no atomicInc, so emulate CUDA's
// atomicInc semantics with a compare-and-exchange loop:
//   old = *m; *m = (old >= value) ? 0 : old+1; return old;
// i.e. a counter that wraps back to 0 once it reaches `value`.
// The template parameter AS selects the address space (global vs local).
template <sycl::access::address_space AS = sycl::access::address_space::global_space>
AMREX_FORCE_INLINE
unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
{
#if defined(__SYCL_DEVICE_ONLY__)
// Relaxed, device-scope atomic_ref: only atomicity is needed, no ordering.
constexpr auto mo = sycl::memory_order::relaxed;
constexpr auto ms = sycl::memory_scope::device;
sycl::atomic_ref<unsigned int,mo,ms,AS> a{*m};
unsigned int oldi = a.load(), newi;
do {
// wrap to 0 once the counter reaches `value`
newi = (oldi >= value) ? 0u : (oldi+1u);
} while (! a.compare_exchange_strong(oldi, newi));
return oldi;
#else
// Host fallback: plain read-modify-write, NOT atomic (see the note at the
// top of this header).
auto const old = *m;
*m = (old >= value) ? 0u : (old+1u);
return old;
#endif
}
#else
// CUDA/HIP build: map directly onto the hardware atomicInc intrinsic in
// device code; plain (non-atomic) read-modify-write on the host.
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
{
#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
return atomicInc(m, value);
#else
// Host fallback: NOT atomic.
auto const old = *m;
*m = (old >= value) ? 0u : (old+1u);
return old;
#endif
}
#endif

////////////////////////////////////////////////////////////////////////
// Dec
////////////////////////////////////////////////////////////////////////

// atomicDec semantics:
//   old = *m; *m = (old == 0 || old > value) ? value : old-1; return old;
// i.e. a counter that wraps back to `value` when it hits 0 (or exceeds it).
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
unsigned int Dec (unsigned int* const m, unsigned int const value) noexcept
{
#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
// CUDA/HIP device code: hardware atomicDec intrinsic.
return atomicDec(m, value);
#elif defined(__SYCL_DEVICE_ONLY__)
// SYCL has no atomicDec; emulate it with a relaxed, device-scope
// compare-and-exchange loop on a global-address-space atomic_ref.
constexpr auto mo = sycl::memory_order::relaxed;
constexpr auto ms = sycl::memory_scope::device;
constexpr auto as = sycl::access::address_space::global_space;
sycl::atomic_ref<unsigned int,mo,ms,as> a{*m};
unsigned int oldi = a.load(), newi;
do {
// wrap back to `value` when the counter is 0 or above `value`
newi = ((oldi == 0u) || (oldi > value)) ? value : (oldi-1u);
} while (! a.compare_exchange_strong(oldi, newi));
return oldi;
#else
// Host fallback: plain read-modify-write, NOT atomic (see the note at the
// top of this header).
auto const old = *m;
*m = ((old == 0u) || (old > value)) ? value : (old-1u);
return old;
#endif
}

////////////////////////////////////////////////////////////////////////
// Exch
////////////////////////////////////////////////////////////////////////
Expand Down
6 changes: 3 additions & 3 deletions Src/Base/AMReX_Scan.H
Original file line number Diff line number Diff line change
Expand Up @@ -436,8 +436,8 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
if (gridDimx > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdxx == 0) {
unsigned int bid = Gpu::Atomic::Inc<sycl::access::address_space::global_space>
(virtual_block_id_p, gridDimx);
unsigned int bid = Gpu::Atomic::Add<sycl::access::address_space::global_space>
(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
gh.item->barrier(sycl::access::fence_space::local_space);
Expand Down Expand Up @@ -942,7 +942,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
if (gridDim.x > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdx.x == 0) {
unsigned int bid = Gpu::Atomic::Inc(virtual_block_id_p, gridDim.x);
unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
__syncthreads();
Expand Down
3 changes: 1 addition & 2 deletions Src/Particle/AMReX_NeighborParticlesI.H
Original file line number Diff line number Diff line change
Expand Up @@ -1036,7 +1036,6 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)

Gpu::Buffer<unsigned int> np_boundary({0});
unsigned int* p_np_boundary = np_boundary.data();
constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();

AMREX_FOR_1D ( np_real, i,
{
Expand Down Expand Up @@ -1065,7 +1064,7 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)
if (call_check_pair(check_pair, ptile_data, ptile_data, i, pperm[p])) {
IntVect cell_ijk = getParticleCell(pstruct[pperm[p]], plo, dxi, domain);
if (!box.contains(cell_ijk)) {
int loc = Gpu::Atomic::Inc(p_np_boundary, max_unsigned_int);
unsigned int loc = Gpu::Atomic::Add(p_np_boundary, 1u);
p_boundary_particle_ids[loc] = i;
isActualNeighbor = true;
break;
Expand Down
6 changes: 2 additions & 4 deletions Src/Particle/AMReX_ParticleCommunication.H
Original file line number Diff line number Diff line change
Expand Up @@ -152,8 +152,6 @@ struct ParticleCopyPlan
auto p_dst_box_counts = m_box_counts_d.dataPtr();
auto getBucket = pc.BufferMap().getBucketFunctor();

constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();

m_dst_indices.resize(num_levels);
for (int lev = 0; lev < num_levels; ++lev)
{
Expand All @@ -174,8 +172,8 @@ struct ParticleCopyPlan
if (dst_box >= 0)
{
int dst_lev = p_levs[i];
int index = Gpu::Atomic::Inc(
&p_dst_box_counts[getBucket(dst_lev, dst_box)], max_unsigned_int);
int index = static_cast<int>(Gpu::Atomic::Add(
&p_dst_box_counts[getBucket(dst_lev, dst_box)], 1u));
p_dst_indices[i] = index;
}
});
Expand Down

0 comments on commit 192532e

Please sign in to comment.