Skip to content

Commit

Permalink
ParallelFor for Reduction
Browse files Browse the repository at this point in the history
Add capability for ParallelFor to safely do reduction using deviceReduceSum,
Min, etc. The user passes Gpu::KernelInfo{}.setReduction(true) to notify
ParallelFor that this is a parallel reduction, and gives ParallelFor a
callable that takes Gpu::Handler. A Gpu::Handler is needed to call
deviceReduceSum.

Also add Gpu::Buffer class, whose data pointer can be used as a device
destination for deviceReduceSum. It also has a copyToHost method to copy the
device result back to the host.

See Tutorials/GPU/ParallelReduce for examples of how to use ParallelFor for
reduction.

Also note that the reduction function is safe with respect to OpenMP CPU threads. Thus the
same code can also run with OpenMP when it is not built for GPU.
  • Loading branch information
WeiqunZhang committed Dec 21, 2020
1 parent c8cdfa6 commit 66b0f32
Show file tree
Hide file tree
Showing 20 changed files with 905 additions and 169 deletions.
14 changes: 7 additions & 7 deletions Src/AmrCore/AMReX_TagBox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,9 +439,9 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
int bid = h.item.get_group_linear_id();
int tid = h.item.get_local_id(0);
int icell = h.item.get_global_id(0);
int bid = h.item->get_group_linear_id();
int tid = h.item->get_local_id(0);
int icell = h.item->get_global_id(0);

int t = 0;
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
Expand Down Expand Up @@ -517,15 +517,15 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
int bid = h.item.get_group(0);
int tid = h.item.get_local_id(0);
int icell = h.item.get_global_id(0);
int bid = h.item->get_group(0);
int tid = h.item->get_local_id(0);
int icell = h.item->get_global_id(0);

unsigned int* shared_counter = (unsigned int*)h.local;
if (tid == 0) {
*shared_counter = 0;
}
h.item.barrier(sycl::access::fence_space::local_space);
h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
Expand Down
4 changes: 4 additions & 0 deletions Src/Base/AMReX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -586,6 +586,10 @@ amrex::Finalize ()
void
amrex::Finalize (amrex::AMReX* pamrex)
{
#ifdef AMREX_USE_GPU
Gpu::synchronize();
#endif

AMReX::erase(pamrex);

BL_TINY_PROFILE_FINALIZE();
Expand Down
14 changes: 14 additions & 0 deletions Src/Base/AMReX_Box.H
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,20 @@ public:
&& p.z >= smallend[2] && p.z <= bigend[2]);
}

//! Returns true if the cell (i,j,k) is contained within this Box.
//! All three index arguments are always accepted; those beyond
//! AMREX_SPACEDIM are ignored, so callers can pass three indices
//! regardless of the build's dimensionality.
AMREX_GPU_HOST_DEVICE
#if (AMREX_SPACEDIM == 1)
bool contains (int i, int, int) const noexcept {
#elif (AMREX_SPACEDIM == 2)
bool contains (int i, int j, int) const noexcept {
#else
bool contains (int i, int j, int k) const noexcept {
#endif
// AMREX_D_TERM expands to only the terms for the active dimensions.
return AMREX_D_TERM(i >= smallend[0] && i <= bigend[0],
&& j >= smallend[1] && j <= bigend[1],
&& k >= smallend[2] && k <= bigend[2]);
}

/** \brief Returns true if argument is contained within Box.
* It is an error if the Boxes have different types.
*/
Expand Down
1 change: 1 addition & 0 deletions Src/Base/AMReX_Gpu.H
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ namespace amrex { namespace Cuda {} }
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_GpuBuffer.H>
#include <AMReX_GpuAsyncArray.H>
#include <AMReX_GpuElixir.H>
#include <AMReX_GpuMemory.H>
Expand Down
11 changes: 8 additions & 3 deletions Src/Base/AMReX_GpuAsyncArray.H
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, n*sizeof(T));
}
#endif
Expand All @@ -51,7 +51,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
}
else
#endif
Expand All @@ -62,6 +62,11 @@ public:

~AsyncArray () { clear(); }

AsyncArray (AsyncArray const&) = delete;
AsyncArray (AsyncArray &&) = delete;
void operator= (AsyncArray const&) = delete;
void operator= (AsyncArray &&) = delete;

T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }
void clear ()
Expand All @@ -88,7 +93,7 @@ public:
#else
// xxxxx DPCPP todo
Gpu::streamSynchronize();
The_Device_Arena()->free(d_data);
The_Arena()->free(d_data);
std::free(h_data);
#endif
}
Expand Down
2 changes: 1 addition & 1 deletion Src/Base/AMReX_GpuAsyncArray.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ extern "C" {
void* hp = pp[1];
std::free(hp);
std::free(p);
amrex::The_Device_Arena()->free(dp);
amrex::The_Arena()->free(dp);
}
}
#endif
Expand Down
45 changes: 45 additions & 0 deletions Src/Base/AMReX_GpuAtomic.H
Original file line number Diff line number Diff line change
Expand Up @@ -452,5 +452,50 @@ namespace HostDevice { namespace Atomic {

}}

#ifdef AMREX_USE_GPU
// Functor wrappers around the Gpu::Atomic free functions, so an atomic
// reduction operation can be passed around as a stateless callable object.
namespace Gpu {
    //! Atomically add v into *p (no return value).
    template <typename T>
    struct AtomicAdd
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::AddNoRet(p, v);
        }
    };

    //! Atomically update *p to min(*p, v).
    template <typename T>
    struct AtomicMin
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::Min(p, v);
        }
    };

    //! Atomically update *p to max(*p, v).
    template <typename T>
    struct AtomicMax
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::Max(p, v);
        }
    };

    //! Atomically update *p to (*p && v).
    template <typename T>
    struct AtomicLogicalAnd
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::LogicalAnd(p, v);
        }
    };

    //! Atomically update *p to (*p || v).
    template <typename T>
    struct AtomicLogicalOr
    {
        AMREX_GPU_DEVICE void operator() (T* const p, T const v) noexcept {
            Gpu::Atomic::LogicalOr(p, v);
        }
    };
}
#endif

}
#endif
113 changes: 113 additions & 0 deletions Src/Base/AMReX_GpuBuffer.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#ifndef AMREX_GPU_DEVICE_BUFFER_H_
#define AMREX_GPU_DEVICE_BUFFER_H_
#include <AMReX_Config.H>

#include <AMReX_Arena.H>
#include <AMReX_TypeTraits.H>
#include <AMReX_GpuDevice.H>
#include <cstring>
#include <cstdlib>
#include <initializer_list>
#include <memory>

namespace amrex {
namespace Gpu {

/**
 * \brief Paired pinned-host/device buffer for trivially copyable types.
 *
 * The host copy lives in pinned memory (The_Pinned_Arena) so that the
 * asynchronous host<->device copies are safe; when inside a GPU launch
 * region a device copy is allocated from The_Arena and the data is
 * uploaded.  data() returns the device pointer when it exists, otherwise
 * the host pointer, so it can be passed directly to kernels (e.g., as the
 * destination of deviceReduceSum).  Use copyToHost() to bring the device
 * result back.  Not copyable or movable.
 */
template <typename T, typename std::enable_if<AMREX_IS_TRIVIALLY_COPYABLE(T),int>::type = 0>
class Buffer
{
public:

    //! Construct from an initializer list, uploading to the device if possible.
    Buffer (std::initializer_list<T> init)
        : m_size(init.size())
    {
        if (m_size == 0) return;
        allocate_and_copy(init.begin());
    }

    //! Construct from n elements at host pointer h_p, uploading to the device if possible.
    Buffer (T const* h_p, const std::size_t n)
        : m_size(n)
    {
        if (m_size == 0) return;
        allocate_and_copy(h_p);
    }

    ~Buffer () { clear(); }

    Buffer (Buffer const&) = delete;
    Buffer (Buffer &&) = delete;
    void operator= (Buffer const&) = delete;
    void operator= (Buffer &&) = delete;

    //! Device pointer if one exists, otherwise the host pointer.
    T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
    T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }

    //! Host pointer (pinned memory when built for GPU).
    T const* hostData () const noexcept { return h_data; }
    T* hostData () noexcept { return h_data; }

    //! \deprecated Misspelled accessor kept for backward compatibility; use hostData().
    [[deprecated("use hostData()")]] T* hostDatat () noexcept { return h_data; }

    //! Number of elements.
    std::size_t size () const noexcept { return m_size; }

    //! Free both copies.  The data pointers become null; size() is unchanged.
    void clear ()
    {
#ifdef AMREX_USE_GPU
        if (d_data) The_Arena()->free(d_data);
        if (h_data) The_Pinned_Arena()->free(h_data);
#else
        std::free(h_data);
#endif
        d_data = nullptr;
        h_data = nullptr;
    }

    //! Copy the device data back to the host (synchronizing the stream)
    //! and return the host pointer.
    T* copyToHost ()
    {
#ifdef AMREX_USE_GPU
        if (d_data)
        {
            Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T));
            Gpu::streamSynchronize();
        }
#endif
        return h_data;
    }

private:

    //! Allocate host (and, in a launch region, device) storage and copy
    //! m_size elements from p.  Shared by both constructors.
    void allocate_and_copy (T const* p)
    {
#ifdef AMREX_USE_GPU
        h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
#else
        h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
#endif
        std::memcpy(h_data, p, m_size*sizeof(T));
#ifdef AMREX_USE_GPU
        if (Gpu::inLaunchRegion())
        {
            d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
            Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
#ifdef AMREX_USE_DPCPP
            // Async copies on the null SYCL queue must be synchronized explicitly.
            if (Gpu::onNullStream()) Gpu::synchronize();
#endif
        }
#endif
    }

    std::size_t m_size;
    T* d_data = nullptr;
    T* h_data = nullptr;
};

}
}

#endif
Loading

0 comments on commit 66b0f32

Please sign in to comment.