Remove Atomic::Inc and Dec (AMReX-Codes#3117)
The implementation of Atomic::Inc and Atomic::Dec in our SYCL backend uses
compare-and-exchange, because these operations do not exist in the SYCL
standard. This turns out to be very inefficient. Although both CUDA and HIP
have efficient implementations of atomicInc and atomicDec, we are removing
Atomic::Inc and Dec from amrex to avoid surprises. In all the cases where we
have used these functions, they can be replaced with Atomic::Add.
WeiqunZhang authored Feb 1, 2023
1 parent eefe246 commit c09ed7d
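
A minimal standalone sketch (not AMReX code; inc_model and add_model are hypothetical helpers) of why the replacement is safe: atomicInc(m, limit) returns the old value and wraps the counter to zero only when that old value reaches limit, so as long as the counter stays below limit, a plain fetch-and-add hands out exactly the same indices.

#include <cassert>
#include <climits>

// Scalar models of the two operations (hypothetical helpers, not AMReX API).
unsigned int inc_model (unsigned int* m, unsigned int limit) {
    unsigned int old = *m;
    *m = (old >= limit) ? 0u : old + 1u;   // atomicInc semantics: wrap at 'limit'
    return old;
}

unsigned int add_model (unsigned int* m) {
    unsigned int old = *m;
    *m = old + 1u;                         // plain fetch-and-add, no wrap
    return old;
}

int main () {
    unsigned int a = 0u, b = 0u;
    for (int i = 0; i < 1000; ++i) {
        // With a limit that is never reached (e.g. UINT_MAX, or blockDim.x when
        // at most blockDim.x threads increment a per-block counter), the wrap
        // branch is dead code and both calls return identical indices.
        assert(inc_model(&a, UINT_MAX) == add_model(&b));
    }
    return 0;
}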
Showing 5 changed files with 30 additions and 81 deletions.
Src/AmrCore/AMReX_TagBox.cpp (4 additions & 3 deletions)
@@ -542,8 +542,9 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
- unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
- (shared_counter, 20480u);
+ unsigned int itag = Gpu::Atomic::Add<unsigned int,
+ sycl::access::address_space::local_space>
+ (shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
@@ -570,7 +571,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
__syncthreads();

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
- unsigned int itag = Gpu::Atomic::Inc(shared_counter, blockDim.x);
+ unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
Src/Base/AMReX_GpuAtomic.H (21 additions & 69 deletions)
@@ -14,7 +14,6 @@ namespace Gpu { namespace Atomic {

// For Add, Min and Max, we support int, unsigned int, long, unsigned long long, float and double.
// For LogicalOr and LogicalAnd, the data type is int.
- // For Inc and Dec, the data type is unsigned int.
// For Exch and CAS, the data type is generic.
// All these functions are non-atomic in host code!!!
// If one needs them to be atomic in host code, use HostDevice::Atomic::*. Currently only
@@ -125,7 +124,11 @@ namespace detail {

#ifdef AMREX_USE_GPU

+ #ifdef AMREX_USE_DPCPP
+ template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
+ #else
template<class T>
+ #endif
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
T Add_device (T* const sum, T const value) noexcept
{
@@ -135,8 +138,7 @@
#elif defined(__SYCL_DEVICE_ONLY__)
constexpr auto mo = sycl::memory_order::relaxed;
constexpr auto ms = sycl::memory_scope::device;
- constexpr auto as = sycl::access::address_space::global_space;
- sycl::atomic_ref<T,mo,ms,as> a{*sum};
+ sycl::atomic_ref<T,mo,ms,AS> a{*sum};
return a.fetch_add(value);
#else
amrex::ignore_unused(sum, value);
@@ -185,12 +187,20 @@

#endif

+ #ifdef AMREX_USE_DPCPP
+ template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
+ #else
template<class T>
+ #endif
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
T Add (T* const sum, T const value) noexcept
{
#if AMREX_DEVICE_COMPILE
+ #ifdef AMREX_USE_DPCPP
+ return Add_device<T,AS>(sum, value);
+ #else
return Add_device(sum, value);
+ #endif
#else
auto old = *sum;
*sum += value;
@@ -260,12 +270,20 @@
// AddNoRet
////////////////////////////////////////////////////////////////////////

+ #ifdef AMREX_USE_DPCPP
+ template<class T, sycl::access::address_space AS = sycl::access::address_space::global_space>
+ #else
template<class T>
+ #endif
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void AddNoRet (T* const sum, T const value) noexcept
{
#if AMREX_DEVICE_COMPILE
+ #ifdef AMREX_USE_DPCPP
+ Add_device<T,AS>(sum, value);
+ #else
Add_device(sum, value);
+ #endif
#else
*sum += value;
#endif
@@ -456,72 +474,6 @@
#endif
}

- ////////////////////////////////////////////////////////////////////////
- // Inc
- ////////////////////////////////////////////////////////////////////////
-
- #ifdef AMREX_USE_DPCPP
- template <sycl::access::address_space AS = sycl::access::address_space::global_space>
- AMREX_FORCE_INLINE
- unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
- {
- #if defined(__SYCL_DEVICE_ONLY__)
- constexpr auto mo = sycl::memory_order::relaxed;
- constexpr auto ms = sycl::memory_scope::device;
- sycl::atomic_ref<unsigned int,mo,ms,AS> a{*m};
- unsigned int oldi = a.load(), newi;
- do {
- newi = (oldi >= value) ? 0u : (oldi+1u);
- } while (! a.compare_exchange_strong(oldi, newi));
- return oldi;
- #else
- auto const old = *m;
- *m = (old >= value) ? 0u : (old+1u);
- return old;
- #endif
- }
- #else
- AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
- unsigned int Inc (unsigned int* const m, unsigned int const value) noexcept
- {
- #if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
- defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
- return atomicInc(m, value);
- #else
- auto const old = *m;
- *m = (old >= value) ? 0u : (old+1u);
- return old;
- #endif
- }
- #endif
-
- ////////////////////////////////////////////////////////////////////////
- // Dec
- ////////////////////////////////////////////////////////////////////////
-
- AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
- unsigned int Dec (unsigned int* const m, unsigned int const value) noexcept
- {
- #if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \
- defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP)
- return atomicDec(m, value);
- #elif defined(__SYCL_DEVICE_ONLY__)
- constexpr auto mo = sycl::memory_order::relaxed;
- constexpr auto ms = sycl::memory_scope::device;
- constexpr auto as = sycl::access::address_space::global_space;
- sycl::atomic_ref<unsigned int,mo,ms,as> a{*m};
- unsigned int oldi = a.load(), newi;
- do {
- newi = ((oldi == 0u) || (oldi > value)) ? value : (oldi-1u);
- } while (! a.compare_exchange_strong(oldi, newi));
- return oldi;
- #else
- auto const old = *m;
- *m = ((old == 0u) || (old > value)) ? value : (old-1u);
- return old;
- #endif
- }
-
////////////////////////////////////////////////////////////////////////
// Exch
////////////////////////////////////////////////////////////////////////
Src/Base/AMReX_Scan.H (2 additions & 3 deletions)
@@ -436,8 +436,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
if (gridDimx > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdxx == 0) {
- unsigned int bid = Gpu::Atomic::Inc<sycl::access::address_space::global_space>
- (virtual_block_id_p, gridDimx);
+ unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
gh.item->barrier(sycl::access::fence_space::local_space);
@@ -942,7 +941,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE, RetSum a_ret_sum = retSum)
if (gridDim.x > 1) {
int& virtual_block_id_shared = *((int*)(shared2+nwarps));
if (threadIdx.x == 0) {
- unsigned int bid = Gpu::Atomic::Inc(virtual_block_id_p, gridDim.x);
+ unsigned int bid = Gpu::Atomic::Add(virtual_block_id_p, 1u);
virtual_block_id_shared = bid;
}
__syncthreads();
Src/Particle/AMReX_NeighborParticlesI.H (1 addition & 2 deletions)
@@ -1036,7 +1036,6 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)

Gpu::Buffer<unsigned int> np_boundary({0});
unsigned int* p_np_boundary = np_boundary.data();
- constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();

AMREX_FOR_1D ( np_real, i,
{
@@ -1065,7 +1064,7 @@ selectActualNeighbors (CheckPair&& check_pair, int num_cells)
if (call_check_pair(check_pair, ptile_data, ptile_data, i, pperm[p])) {
IntVect cell_ijk = getParticleCell(pstruct[pperm[p]], plo, dxi, domain);
if (!box.contains(cell_ijk)) {
- int loc = Gpu::Atomic::Inc(p_np_boundary, max_unsigned_int);
+ unsigned int loc = Gpu::Atomic::Add(p_np_boundary, 1u);
p_boundary_particle_ids[loc] = i;
isActualNeighbor = true;
break;
Src/Particle/AMReX_ParticleCommunication.H (2 additions & 4 deletions)
@@ -152,8 +152,6 @@ struct ParticleCopyPlan
auto p_dst_box_counts = m_box_counts_d.dataPtr();
auto getBucket = pc.BufferMap().getBucketFunctor();

- constexpr unsigned int max_unsigned_int = std::numeric_limits<unsigned int>::max();
-
m_dst_indices.resize(num_levels);
for (int lev = 0; lev < num_levels; ++lev)
{
@@ -174,8 +172,8 @@
if (dst_box >= 0)
{
int dst_lev = p_levs[i];
- int index = Gpu::Atomic::Inc(
- &p_dst_box_counts[getBucket(dst_lev, dst_box)], max_unsigned_int);
+ int index = static_cast<int>(Gpu::Atomic::Add(
+ &p_dst_box_counts[getBucket(dst_lev, dst_box)], 1u));
p_dst_indices[i] = index;
}
});
