ParallelFor for Reduction (#1658)
Add capability for `ParallelFor` to safely perform reductions using `deviceReduceSum`,
`Min`, etc.  The user passes `Gpu::KernelInfo{}.setReduction(true)` to notify
`ParallelFor` that this is a parallel reduction, and gives `ParallelFor` a callable
that takes a `Gpu::Handler`.  A `Gpu::Handler` is needed to call `deviceReduceSum`.
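
A minimal sketch of the intended call pattern (an assumption-laden example, not verbatim
from the tutorial; the box `bx`, the `Array4` `fab`, and the zero-initialized device
pointer `dp` are placeholders):

    // Hedged sketch: sum fab(i,j,k) over bx into the device location *dp.
    // dp is assumed to point at a zero-initialized value, e.g. from Gpu::Buffer below.
    amrex::ParallelFor(Gpu::KernelInfo{}.setReduction(true), bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept
        {
            Gpu::deviceReduceSum(dp, fab(i,j,k), handler);
        });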

Also add `Gpu::Buffer` class, whose data pointer can be used as a device
destination for `deviceReduceSum`.  It also has a `copyToHost` method to
copy the device result back to the host. 
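
For example, a one-element `Gpu::Buffer` can serve as the destination for the sketch
above (again a hedged sketch under the same assumptions):

    // Hedged sketch: device/host buffer holding the reduction result.
    Gpu::Buffer<Real> sum_buf({0.0});   // starts at zero on both host and device
    Real* dp = sum_buf.data();          // device pointer when built for GPU, host pointer otherwise
    // ... run the reduction ParallelFor shown above with dp ...
    Real sum = *(sum_buf.copyToHost()); // copy the device result back and read it on the host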

See `Tutorials/GPU/ParallelReduce` for examples of how to use `ParallelFor`
for reduction.

Also note that the reduction functions are safe to use with OpenMP CPU threads, so the
same code can run with OpenMP when it is not built for GPU.

Co-authored-by: Andrew Myers <atmyers2@gmail.com>
WeiqunZhang and atmyers authored Dec 21, 2020
1 parent c8cdfa6 commit 10ed0e0
Showing 21 changed files with 963 additions and 177 deletions.
14 changes: 7 additions & 7 deletions Src/AmrCore/AMReX_TagBox.cpp
@@ -439,9 +439,9 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
- int bid = h.item.get_group_linear_id();
- int tid = h.item.get_local_id(0);
- int icell = h.item.get_global_id(0);
+ int bid = h.item->get_group_linear_id();
+ int tid = h.item->get_local_id(0);
+ int icell = h.item->get_global_id(0);

int t = 0;
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
@@ -517,15 +517,15 @@ TagBoxArray::local_collate_gpu (Vector<IntVect>& v) const
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
{
- int bid = h.item.get_group(0);
- int tid = h.item.get_local_id(0);
- int icell = h.item.get_global_id(0);
+ int bid = h.item->get_group(0);
+ int tid = h.item->get_local_id(0);
+ int icell = h.item->get_global_id(0);

unsigned int* shared_counter = (unsigned int*)h.local;
if (tid == 0) {
*shared_counter = 0;
}
- h.item.barrier(sycl::access::fence_space::local_space);
+ h.item->barrier(sycl::access::fence_space::local_space);

if (icell < ncells && tags[icell] != TagBox::CLEAR) {
unsigned int itag = Gpu::Atomic::Inc<sycl::access::address_space::local_space>
4 changes: 4 additions & 0 deletions Src/Base/AMReX.cpp
@@ -586,6 +586,10 @@ amrex::Finalize ()
void
amrex::Finalize (amrex::AMReX* pamrex)
{
#ifdef AMREX_USE_GPU
Gpu::synchronize();
#endif

AMReX::erase(pamrex);

BL_TINY_PROFILE_FINALIZE();
14 changes: 14 additions & 0 deletions Src/Base/AMReX_Box.H
@@ -221,6 +221,20 @@ public:
&& p.z >= smallend[2] && p.z <= bigend[2]);
}

//! Returns true if argument is contained within Box.
AMREX_GPU_HOST_DEVICE
#if (AMREX_SPACEDIM == 1)
bool contains (int i, int, int) const noexcept {
#elif (AMREX_SPACEDIM == 2)
bool contains (int i, int j, int) const noexcept {
#else
bool contains (int i, int j, int k) const noexcept {
#endif
return AMREX_D_TERM(i >= smallend[0] && i <= bigend[0],
&& j >= smallend[1] && j <= bigend[1],
&& k >= smallend[2] && k <= bigend[2]);
}

/** \brief Returns true if argument is contained within Box.
* It is an error if the Boxes have different types.
*/
1 change: 1 addition & 0 deletions Src/Base/AMReX_Gpu.H
@@ -22,6 +22,7 @@ namespace amrex { namespace Cuda {} }
#include <AMReX_GpuAtomic.H>
#include <AMReX_GpuUtility.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_GpuBuffer.H>
#include <AMReX_GpuAsyncArray.H>
#include <AMReX_GpuElixir.H>
#include <AMReX_GpuMemory.H>
11 changes: 8 additions & 3 deletions Src/Base/AMReX_GpuAsyncArray.H
@@ -38,7 +38,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
- d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
+ d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, n*sizeof(T));
}
#endif
@@ -51,7 +51,7 @@ public:
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
- d_data = static_cast<T*>(The_Device_Arena()->alloc(n*sizeof(T)));
+ d_data = static_cast<T*>(The_Arena()->alloc(n*sizeof(T)));
}
else
#endif
@@ -62,6 +62,11 @@

~AsyncArray () { clear(); }

AsyncArray (AsyncArray const&) = delete;
AsyncArray (AsyncArray &&) = delete;
void operator= (AsyncArray const&) = delete;
void operator= (AsyncArray &&) = delete;

T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }
void clear ()
@@ -88,7 +93,7 @@ public:
#else
// xxxxx DPCPP todo
Gpu::streamSynchronize();
- The_Device_Arena()->free(d_data);
+ The_Arena()->free(d_data);
std::free(h_data);
#endif
}
2 changes: 1 addition & 1 deletion Src/Base/AMReX_GpuAsyncArray.cpp
@@ -17,7 +17,7 @@ extern "C" {
void* hp = pp[1];
std::free(hp);
std::free(p);
- amrex::The_Device_Arena()->free(dp);
+ amrex::The_Arena()->free(dp);
}
}
#endif
45 changes: 45 additions & 0 deletions Src/Base/AMReX_GpuAtomic.H
@@ -452,5 +452,50 @@ namespace HostDevice { namespace Atomic {

}}

#ifdef AMREX_USE_GPU
// functors
namespace Gpu {
template <typename T>
struct AtomicAdd
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::AddNoRet(dest, source);
}
};

template <typename T>
struct AtomicMin
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::Min(dest, source);
}
};

template <typename T>
struct AtomicMax
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::Max(dest, source);
}
};

template <typename T>
struct AtomicLogicalAnd
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::LogicalAnd(dest, source);
}
};

template <typename T>
struct AtomicLogicalOr
{
AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept {
Gpu::Atomic::LogicalOr(dest, source);
}
};
}
#endif

}
#endif
113 changes: 113 additions & 0 deletions Src/Base/AMReX_GpuBuffer.H
@@ -0,0 +1,113 @@
#ifndef AMREX_GPU_DEVICE_BUFFER_H_
#define AMREX_GPU_DEVICE_BUFFER_H_
#include <AMReX_Config.H>

#include <AMReX_Arena.H>
#include <AMReX_TypeTraits.H>
#include <AMReX_GpuDevice.H>
#include <cstring>
#include <cstdlib>
#include <initializer_list>
#include <memory>

namespace amrex {
namespace Gpu {

template <typename T, typename std::enable_if<AMREX_IS_TRIVIALLY_COPYABLE(T),int>::type = 0>
class Buffer
{
public:

Buffer (std::initializer_list<T> init)
: m_size(init.size())
{
if (m_size == 0) return;
#ifdef AMREX_USE_GPU
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
#else
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
#endif
std::memcpy(h_data, init.begin(), m_size*sizeof(T));
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
#ifdef AMREX_USE_DPCPP
if (Gpu::onNullStream()) Gpu::synchronize();
#endif
}
#endif
}

Buffer (T const* h_p, const std::size_t n)
: m_size(n)
{
if (m_size == 0) return;
#ifdef AMREX_USE_GPU
h_data = static_cast<T*>(The_Pinned_Arena()->alloc(m_size*sizeof(T)));
#else
h_data = static_cast<T*>(std::malloc(m_size*sizeof(T)));
#endif
std::memcpy(h_data, h_p, m_size*sizeof(T));
#ifdef AMREX_USE_GPU
if (Gpu::inLaunchRegion())
{
d_data = static_cast<T*>(The_Arena()->alloc(m_size*sizeof(T)));
Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T));
#ifdef AMREX_USE_DPCPP
if (Gpu::onNullStream()) Gpu::synchronize();
#endif
}
#endif
}

~Buffer () { clear(); }

Buffer (Buffer const&) = delete;
Buffer (Buffer &&) = delete;
void operator= (Buffer const&) = delete;
void operator= (Buffer &&) = delete;

T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; }
T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; }

T const* hostData () const noexcept { return h_data; }
T* hostData () noexcept { return h_data; }

std::size_t size () const noexcept { return m_size; }

void clear ()
{
#ifdef AMREX_USE_GPU
if (d_data) The_Arena()->free(d_data);
if (h_data) The_Pinned_Arena()->free(h_data);
#else
std::free(h_data);
#endif
d_data = nullptr;
h_data = nullptr;
}

T* copyToHost ()
{
#ifdef AMREX_USE_GPU
if (d_data)
{
Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T));
Gpu::streamSynchronize();
}
#endif
return h_data;
}

private:
std::size_t m_size;
T* d_data = nullptr;
T* h_data = nullptr;
};

}
}

#endif