diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index f387a5f1c8e..c01073c43c9 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -439,9 +439,9 @@ TagBoxArray::local_collate_gpu (Vector& v) const Gpu::Device::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept { - int bid = h.item.get_group_linear_id(); - int tid = h.item.get_local_id(0); - int icell = h.item.get_global_id(0); + int bid = h.item->get_group_linear_id(); + int tid = h.item->get_local_id(0); + int icell = h.item->get_global_id(0); int t = 0; if (icell < ncells && tags[icell] != TagBox::CLEAR) { @@ -517,15 +517,15 @@ TagBoxArray::local_collate_gpu (Vector& v) const amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(), [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept { - int bid = h.item.get_group(0); - int tid = h.item.get_local_id(0); - int icell = h.item.get_global_id(0); + int bid = h.item->get_group(0); + int tid = h.item->get_local_id(0); + int icell = h.item->get_global_id(0); unsigned int* shared_counter = (unsigned int*)h.local; if (tid == 0) { *shared_counter = 0; } - h.item.barrier(sycl::access::fence_space::local_space); + h.item->barrier(sycl::access::fence_space::local_space); if (icell < ncells && tags[icell] != TagBox::CLEAR) { unsigned int itag = Gpu::Atomic::Inc diff --git a/Src/Base/AMReX.cpp b/Src/Base/AMReX.cpp index f3feae49837..36a73e7fc4f 100644 --- a/Src/Base/AMReX.cpp +++ b/Src/Base/AMReX.cpp @@ -586,6 +586,10 @@ amrex::Finalize () void amrex::Finalize (amrex::AMReX* pamrex) { +#ifdef AMREX_USE_GPU + Gpu::synchronize(); +#endif + AMReX::erase(pamrex); BL_TINY_PROFILE_FINALIZE(); diff --git a/Src/Base/AMReX_Box.H b/Src/Base/AMReX_Box.H index 9e9ec893689..f0ff7bed3b0 100644 --- a/Src/Base/AMReX_Box.H +++ b/Src/Base/AMReX_Box.H @@ -221,6 +221,20 @@ public: && p.z >= smallend[2] && p.z <= bigend[2]); } + //! Returns true if argument is contained within Box. + AMREX_GPU_HOST_DEVICE +#if (AMREX_SPACEDIM == 1) + bool contains (int i, int, int) const noexcept { +#elif (AMREX_SPACEDIM == 2) + bool contains (int i, int j, int) const noexcept { +#else + bool contains (int i, int j, int k) const noexcept { +#endif + return AMREX_D_TERM(i >= smallend[0] && i <= bigend[0], + && j >= smallend[1] && j <= bigend[1], + && k >= smallend[2] && k <= bigend[2]); + } + /** \brief Returns true if argument is contained within Box. * It is an error if the Boxes have different types. 
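A minimal usage sketch (not part of the patch) for the new integer-index `Box::contains` overload added above: it lets device code test membership without constructing an `IntVect`. The `flag_valid_cells` name, the MultiFab, and the 0/1 flag values are assumptions chosen only for illustration.

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_GpuLaunch.H>

void flag_valid_cells (amrex::MultiFab& mf)
{
    using namespace amrex;
    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
        Box const& vbx = mfi.validbox();
        Box const& gbx = mfi.fabbox();            // valid region plus ghost cells
        Array4<Real> const& a = mf.array(mfi);
        ParallelFor(gbx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            // The new overload avoids building an IntVect inside device code.
            a(i,j,k) = vbx.contains(i,j,k) ? Real(1.0) : Real(0.0);
        });
    }
}
```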
*/ diff --git a/Src/Base/AMReX_Gpu.H b/Src/Base/AMReX_Gpu.H index c4db3d08a15..32a0b5a8461 100644 --- a/Src/Base/AMReX_Gpu.H +++ b/Src/Base/AMReX_Gpu.H @@ -22,6 +22,7 @@ namespace amrex { namespace Cuda {} } #include #include #include +#include #include #include #include diff --git a/Src/Base/AMReX_GpuAsyncArray.H b/Src/Base/AMReX_GpuAsyncArray.H index 262ae1a4ae5..46d8cd6a0e9 100644 --- a/Src/Base/AMReX_GpuAsyncArray.H +++ b/Src/Base/AMReX_GpuAsyncArray.H @@ -38,7 +38,7 @@ public: #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - d_data = static_cast(The_Device_Arena()->alloc(n*sizeof(T))); + d_data = static_cast(The_Arena()->alloc(n*sizeof(T))); Gpu::htod_memcpy_async(d_data, h_data, n*sizeof(T)); } #endif @@ -51,7 +51,7 @@ public: #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - d_data = static_cast(The_Device_Arena()->alloc(n*sizeof(T))); + d_data = static_cast(The_Arena()->alloc(n*sizeof(T))); } else #endif @@ -62,6 +62,11 @@ public: ~AsyncArray () { clear(); } + AsyncArray (AsyncArray const&) = delete; + AsyncArray (AsyncArray &&) = delete; + void operator= (AsyncArray const&) = delete; + void operator= (AsyncArray &&) = delete; + T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; } T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; } void clear () @@ -88,7 +93,7 @@ public: #else // xxxxx DPCPP todo Gpu::streamSynchronize(); - The_Device_Arena()->free(d_data); + The_Arena()->free(d_data); std::free(h_data); #endif } diff --git a/Src/Base/AMReX_GpuAsyncArray.cpp b/Src/Base/AMReX_GpuAsyncArray.cpp index 35355218b91..520de2f30b0 100644 --- a/Src/Base/AMReX_GpuAsyncArray.cpp +++ b/Src/Base/AMReX_GpuAsyncArray.cpp @@ -17,7 +17,7 @@ extern "C" { void* hp = pp[1]; std::free(hp); std::free(p); - amrex::The_Device_Arena()->free(dp); + amrex::The_Arena()->free(dp); } } #endif diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index 32305d58b6f..71d88994825 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -452,5 +452,50 @@ namespace HostDevice { namespace Atomic { }} +#ifdef AMREX_USE_GPU +// functors +namespace Gpu { + template + struct AtomicAdd + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::AddNoRet(dest, source); + } + }; + + template + struct AtomicMin + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::Min(dest, source); + } + }; + + template + struct AtomicMax + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::Max(dest, source); + } + }; + + template + struct AtomicLogicalAnd + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::LogicalAnd(dest, source); + } + }; + + template + struct AtomicLogicalOr + { + AMREX_GPU_DEVICE void operator() (T* const dest, T const source) noexcept { + Gpu::Atomic::LogicalOr(dest, source); + } + }; +} +#endif + } #endif diff --git a/Src/Base/AMReX_GpuBuffer.H b/Src/Base/AMReX_GpuBuffer.H new file mode 100644 index 00000000000..23b02c255ee --- /dev/null +++ b/Src/Base/AMReX_GpuBuffer.H @@ -0,0 +1,113 @@ +#ifndef AMREX_GPU_DEVICE_BUFFER_H_ +#define AMREX_GPU_DEVICE_BUFFER_H_ +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace amrex { +namespace Gpu { + +template ::type = 0> +class Buffer +{ +public: + + Buffer (std::initializer_list init) + : m_size(init.size()) + { + if (m_size == 0) return; +#ifdef AMREX_USE_GPU + h_data = 
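As context for the `AsyncArray` changes above (device memory now taken from `The_Arena`, copy and move deleted), a minimal usage sketch. The `scale_by_weights` name, the weight values, and the assumption that `d_out` is a device-accessible pointer are made up for the example; this is not code from the patch.

```cpp
#include <AMReX_GpuAsyncArray.H>
#include <AMReX_GpuLaunch.H>
#include <AMReX_Vector.H>

void scale_by_weights (amrex::Real* d_out, int n)
{
    using namespace amrex;
    Vector<Real> h_weights(n, 2.0);                      // host-side input
    Gpu::AsyncArray<Real> weights(h_weights.data(), n);  // async copy into The_Arena memory
    Real const* w = weights.data();                      // device pointer inside a launch region
    ParallelFor(n, [=] AMREX_GPU_DEVICE (int i) noexcept
    {
        d_out[i] *= w[i];
    });
    // Copy and move are deleted, so the AsyncArray object itself must stay in
    // scope here; its destructor releases the buffers after the stream work.
}
```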
static_cast(The_Pinned_Arena()->alloc(m_size*sizeof(T))); +#else + h_data = static_cast(std::malloc(m_size*sizeof(T))); +#endif + std::memcpy(h_data, init.begin(), m_size*sizeof(T)); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + d_data = static_cast(The_Arena()->alloc(m_size*sizeof(T))); + Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); +#ifdef AMREX_USE_DPCPP + if (Gpu::onNullStream()) Gpu::synchronize(); +#endif + } +#endif + } + + Buffer (T const* h_p, const std::size_t n) + : m_size(n) + { + if (m_size == 0) return; +#ifdef AMREX_USE_GPU + h_data = static_cast(The_Pinned_Arena()->alloc(m_size*sizeof(T))); +#else + h_data = static_cast(std::malloc(m_size*sizeof(T))); +#endif + std::memcpy(h_data, h_p, m_size*sizeof(T)); +#ifdef AMREX_USE_GPU + if (Gpu::inLaunchRegion()) + { + d_data = static_cast(The_Arena()->alloc(m_size*sizeof(T))); + Gpu::htod_memcpy_async(d_data, h_data, m_size*sizeof(T)); +#ifdef AMREX_USE_DPCPP + if (Gpu::onNullStream()) Gpu::synchronize(); +#endif + } +#endif + } + + ~Buffer () { clear(); } + + Buffer (Buffer const&) = delete; + Buffer (Buffer &&) = delete; + void operator= (Buffer const&) = delete; + void operator= (Buffer &&) = delete; + + T const* data () const noexcept { return (d_data != nullptr) ? d_data : h_data; } + T* data () noexcept { return (d_data != nullptr) ? d_data : h_data; } + + T const* hostData () const noexcept { return h_data; } + T* hostDatat () noexcept { return h_data; } + + std::size_t size () const noexcept { return m_size; } + + void clear () + { +#ifdef AMREX_USE_GPU + if (d_data) The_Arena()->free(d_data); + if (h_data) The_Pinned_Arena()->free(h_data); +#else + std::free(h_data); +#endif + d_data = nullptr; + h_data = nullptr; + } + + T* copyToHost () + { +#ifdef AMREX_USE_GPU + if (d_data) + { + Gpu::dtoh_memcpy_async(h_data, d_data, m_size*sizeof(T)); + Gpu::streamSynchronize(); + } +#endif + return h_data; + } + +private: + std::size_t m_size; + T* d_data = nullptr; + T* h_data = nullptr; +}; + +} +} + +#endif diff --git a/Src/Base/AMReX_GpuFuse.H b/Src/Base/AMReX_GpuFuse.H index effad729364..80007f849ae 100644 --- a/Src/Base/AMReX_GpuFuse.H +++ b/Src/Base/AMReX_GpuFuse.H @@ -77,15 +77,69 @@ PutLambda (Lambda const& f, char* buf) return Arena::align(sizeof(Lambda)); } +template +AMREX_GPU_DEVICE +auto GetLambda1DLauncherFnPtr (Lambda const& f, Lambda1DLauncher& launcher) + -> decltype(f(0)) +{ + amrex::ignore_unused(f); + launcher = &LaunchLambda1D; +} + +template +AMREX_GPU_DEVICE +auto GetLambda1DLauncherFnPtr (Lambda const& f, Lambda1DLauncher& launcher) + -> decltype(f(0,Gpu::Handler{})) +{ + amrex::ignore_unused(f); + launcher = nullptr; +} + +template +AMREX_GPU_DEVICE +auto GetLambda3DLauncherFnPtr (Lambda const& f, Lambda3DLauncher& launcher) + -> decltype(f(0,0,0)) +{ + amrex::ignore_unused(f); + launcher = &LaunchLambda3D; +} + +template +AMREX_GPU_DEVICE +auto GetLambda3DLauncherFnPtr (Lambda const& f, Lambda3DLauncher& launcher) + -> decltype(f(0,0,0,Gpu::Handler{})) +{ + amrex::ignore_unused(f); + launcher = nullptr; +} + +template +AMREX_GPU_DEVICE +auto GetLambda4DLauncherFnPtr (Lambda const& f, Lambda4DLauncher& launcher) + -> decltype(f(0,0,0,0)) +{ + amrex::ignore_unused(f); + launcher = &LaunchLambda4D; +} + +template +AMREX_GPU_DEVICE +auto GetLambda4DLauncherFnPtr (Lambda const& f, Lambda4DLauncher& launcher) + -> decltype(f(0,0,0,0,Gpu::Handler{})) +{ + amrex::ignore_unused(f); + launcher = nullptr; +} + template void -PutLambda1DLauncher (FuseHelper* helper) +PutLambda1DLauncher 
(FuseHelper* helper, Lambda const& f) { static Lambda1DLauncher fp = nullptr; if (fp == nullptr) { launch_global<<<1,1>>>([=] AMREX_GPU_DEVICE () { - helper->m_fp.L1D = &LaunchLambda1D; + GetLambda1DLauncherFnPtr(f, helper->m_fp.L1D); }); Gpu::synchronize(); fp = helper->m_fp.L1D; @@ -96,13 +150,13 @@ PutLambda1DLauncher (FuseHelper* helper) template void -PutLambda3DLauncher (FuseHelper* helper) +PutLambda3DLauncher (FuseHelper* helper, Lambda const& f) { static Lambda3DLauncher fp = nullptr; if (fp == nullptr) { launch_global<<<1,1>>>([=] AMREX_GPU_DEVICE () { - helper->m_fp.L3D = &LaunchLambda3D; + GetLambda3DLauncherFnPtr(f, helper->m_fp.L3D); }); Gpu::synchronize(); fp = helper->m_fp.L3D; @@ -113,13 +167,13 @@ PutLambda3DLauncher (FuseHelper* helper) template void -PutLambda4DLauncher (FuseHelper* helper) +PutLambda4DLauncher (FuseHelper* helper, Lambda const& f) { static Lambda4DLauncher fp = nullptr; if (fp == nullptr) { launch_global<<<1,1>>>([=] AMREX_GPU_DEVICE () { - helper->m_fp.L4D = &LaunchLambda4D; + GetLambda4DLauncherFnPtr(f, helper->m_fp.L4D); }); Gpu::synchronize(); fp = helper->m_fp.L4D; @@ -152,7 +206,7 @@ public: if (bx.isEmpty()) return; using Lambda = typename std::decay::type; Register_doit(bx, 0, f); - PutLambda3DLauncher(m_helper_buf+(m_nlambdas-1)); + PutLambda3DLauncher(m_helper_buf+(m_nlambdas-1), f); } template @@ -163,7 +217,7 @@ public: if (bx.isEmpty()) return; using Lambda = typename std::decay::type; Register_doit(bx, ncomp, f); - PutLambda4DLauncher(m_helper_buf+(m_nlambdas-1)); + PutLambda4DLauncher(m_helper_buf+(m_nlambdas-1), f); } template ::value> > @@ -176,7 +230,7 @@ public: if (N <= 0) return; using Lambda = typename std::decay::type; Register_doit(Box(), N, f); - PutLambda1DLauncher(m_helper_buf+(m_nlambdas-1)); + PutLambda1DLauncher(m_helper_buf+(m_nlambdas-1), f); } void Launch (); diff --git a/Src/Base/AMReX_GpuKernelInfo.H b/Src/Base/AMReX_GpuKernelInfo.H index 72076bbb64f..dc7e452ee11 100644 --- a/Src/Base/AMReX_GpuKernelInfo.H +++ b/Src/Base/AMReX_GpuKernelInfo.H @@ -9,9 +9,12 @@ class KernelInfo { public: KernelInfo& setFusible (bool flag) { fusible = flag; return *this; } + KernelInfo& setReduction (bool flag) { has_reduction = flag; return *this; } bool isFusible () const { return fusible; } + bool hasReduction () const { return has_reduction; } private: bool fusible = false; + bool has_reduction = false; }; }} diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 6eee4c94800..6799eb9d133 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -4,6 +4,56 @@ namespace amrex { +namespace detail { + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i) + noexcept -> decltype(f(0)) + { + f(i); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i) + noexcept -> decltype(f(0,Gpu::Handler{})) + { + f(i,Gpu::Handler{}); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k) + noexcept -> decltype(f(0,0,0)) + { + f(i,j,k); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k) + noexcept -> decltype(f(0,0,0,Gpu::Handler{})) + { + f(i,j,k,Gpu::Handler{}); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T n) + noexcept -> decltype(f(0,0,0,0)) + { + f(i,j,k,n); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T n) + noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) + { + f(i,j,k,n,Gpu::Handler{}); + } +} + template void launch (T 
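The `GetLambda*LauncherFnPtr` helpers above and the `detail::call_f` overloads that follow both rely on expression SFINAE over the lambda's call signature: the overload whose trailing return type is well-formed is the one that participates. A reduced, standalone illustration of that dispatch (the `call` function and the toy `Handler` are assumptions, not AMReX code):

```cpp
#include <cstdio>

struct Handler {};   // stand-in for amrex::Gpu::Handler

// Selected when f is callable as f(int): the plain form.
template <typename F>
auto call (F const& f, int i) -> decltype(f(0))
{
    return f(i);
}

// Selected when f expects the extra Handler argument.
template <typename F>
auto call (F const& f, int i) -> decltype(f(0, Handler{}))
{
    return f(i, Handler{});
}

int main ()
{
    call([] (int i) { std::printf("plain %d\n", i); }, 1);
    call([] (int i, Handler const&) { std::printf("with handler %d\n", i); }, 2);
    return 0;
}
```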
const& n, L&& f, std::size_t /*shared_mem_bytes*/=0) noexcept { @@ -14,7 +64,7 @@ template item) AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) { - f(Gpu::Handler{item,shared_data.get_pointer()}); + f(Gpu::Handler{&item,shared_data.get_pointer()}); }); }); } catch (sycl::exception const& ex) { @@ -94,8 +94,58 @@ void launch (T const& n, L&& f) noexcept } } +namespace detail { + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, Gpu::Handler const&) + noexcept -> decltype(f(0)) + { + f(i); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, Gpu::Handler const& handler) + noexcept -> decltype(f(0,Gpu::Handler{})) + { + f(i,handler); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, Gpu::Handler const&) + noexcept -> decltype(f(0,0,0)) + { + f(i,j,k); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, Gpu::Handler const& handler) + noexcept -> decltype(f(0,0,0,Gpu::Handler{})) + { + f(i,j,k,handler); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, Gpu::Handler const&) + noexcept -> decltype(f(0,0,0,0)) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, Gpu::Handler const& handler) + noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,handler); + } +} + template ::value> > -void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; const auto ec = Gpu::ExecutionConfig(n); @@ -105,25 +155,46 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, T n, L&& f) noexcept int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { - q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), - sycl::range<1>(nthreads_per_block)), - [=] (sycl::nd_item<1> item) - AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) - { - for (T i = item.get_global_id(0), stride = item.get_global_range(0); - i < n; i += stride) { - f(i); - } + if (info.hasReduction()) { + q.submit([&] (sycl::handler& h) { + sycl::accessor + shared_data(sycl::range<1>(Gpu::Device::warp_size), h); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (T i = item.get_global_id(0), stride = item.get_global_range(0); + i < n; i += stride) { + int n_active_threads = amrex::min(n-i+(T)item.get_local_id(0), + (T)item.get_local_range(0)); + detail::call_f(f, i, Gpu::Handler{&item, shared_data.get_pointer(), + n_active_threads}); + } + }); }); - }); + } else { + q.submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (T i = item.get_global_id(0), stride = item.get_global_range(0); + i < n; i += stride) { + detail::call_f(f, i, Gpu::Handler{&item}); + } + }); + }); + } } catch (sycl::exception const& ex) { amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!"); } } template -void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept { if (amrex::isEmpty(box)) 
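In the 1D reduction branch above, the `n_active_threads` handed to `Gpu::Handler` works out to the number of still-unprocessed iterations seen by the block, clamped to the block size. A small host-side check of that arithmetic; the helper function and the sample numbers are illustrative only.

```cpp
#include <algorithm>
#include <cassert>

int n_active_threads (int n, int i, int tid, int block_size)
{
    // i is the thread's current global loop index, tid its index within the block.
    return std::min(n - i + tid, block_size);
}

int main ()
{
    // 1000 work items, 256-thread blocks; the last block starts at global index 768,
    // so its thread 0 computes 1000 - 768 + 0 = 232 active threads.
    assert(n_active_threads(1000, 768, 0, 256) == 232);
    // Thread 37 of the block starting at 512 (i = 549) still sees a fully covered block.
    assert(n_active_threads(1000, 549, 37, 256) == 256);
    return 0;
}
```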
return; int ncells = box.numPts(); @@ -136,31 +207,58 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, L&& f) noexce int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { - q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), - sycl::range<1>(nthreads_per_block)), - [=] (sycl::nd_item<1> item) - AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) - { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - f(i,j,k); - } + if (info.hasReduction()) { + q.submit([&] (sycl::handler& h) { + sycl::accessor + shared_data(sycl::range<1>(Gpu::Device::warp_size), h); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + int n_active_threads = amrex::min(ncells-i+(int)item.get_local_id(0), + (int)item.get_local_range(0)); + detail::call_f(f, i, j, k, Gpu::Handler{&item, shared_data.get_pointer(), + n_active_threads}); + } + }); }); - }); + } else { + q.submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + detail::call_f(f,i,j,k,Gpu::Handler{&item}); + } + }); + }); + } } catch (sycl::exception const& ex) { amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!"); } } template ::value> > -void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& f) noexcept +void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexcept { if (amrex::isEmpty(box)) return; int ncells = box.numPts(); @@ -173,26 +271,52 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box, T ncomp, L&& int nthreads_total = nthreads_per_block * ec.numBlocks.x; auto& q = Gpu::Device::streamQueue(); try { - q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), - sycl::range<1>(nthreads_per_block)), - [=] (sycl::nd_item<1> item) - AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) - { - for (int icell = item.get_global_id(0), stride = item.get_global_range(0); - icell < ncells; icell += stride) { - int k = icell / (len.x*len.y); - int j = (icell - k*(len.x*len.y)) / len.x; - int i = (icell - k*(len.x*len.y)) - j*len.x; - i += lo.x; - j += lo.y; - k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); + if (info.hasReduction()) { + q.submit([&] (sycl::handler& h) { + sycl::accessor + shared_data(sycl::range<1>(Gpu::Device::warp_size), h); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + 
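For reference, the `icell` to `(i,j,k)` decode repeated in these kernel bodies can be checked on the host. The `decode` helper, the `I3` struct, and the sample box are assumptions introduced only for this check.

```cpp
#include <cassert>

struct I3 { int x, y, z; };

I3 decode (int icell, I3 lo, I3 len)
{
    int k = icell / (len.x * len.y);
    int j = (icell - k*(len.x*len.y)) / len.x;
    int i = (icell - k*(len.x*len.y)) - j*len.x;
    return {i + lo.x, j + lo.y, k + lo.z};
}

int main ()
{
    I3 lo{4, 8, 16}, len{3, 5, 7};            // a 3x5x7 box anchored at (4,8,16)
    I3 c = decode(0, lo, len);                // first cell is the lower corner
    assert(c.x == 4 && c.y == 8 && c.z == 16);
    c = decode(3*5*7 - 1, lo, len);           // last cell is the upper corner
    assert(c.x == 6 && c.y == 12 && c.z == 22);
    return 0;
}
```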
sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + int n_active_threads = amrex::min(ncells-i+(int)item.get_local_id(0), + (int)item.get_local_range(0)); + detail::call_f(f, i, j, k, ncomp, + Gpu::Handler{&item, shared_data.get_pointer(), + n_active_threads}); } - } + }); }); - }); + } else { + q.submit([&] (sycl::handler& h) { + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(nthreads_per_block)), + [=] (sycl::nd_item<1> item) + AMREX_REQUIRE_SUBGROUP_SIZE(Gpu::Device::warp_size) + { + for (int icell = item.get_global_id(0), stride = item.get_global_range(0); + icell < ncells; icell += stride) { + int k = icell / (len.x*len.y); + int j = (icell - k*(len.x*len.y)) / len.x; + int i = (icell - k*(len.x*len.y)) - j*len.x; + i += lo.x; + j += lo.y; + k += lo.z; + detail::call_f(f,i,j,k,ncomp,Gpu::Handler{&item}); + } + }); + }); + } } catch (sycl::exception const& ex) { amrex::Abort(std::string("ParallelFor: ")+ex.what()+"!!!!!"); } @@ -685,7 +809,7 @@ void VecReduce (N n, T const& init_val, L1&& f1, L2&& f2) noexcept i < n; i += stride) { f1(i,&r); } - f2(r,Gpu::Handler{item,shared_data.get_pointer()}); + f2(r,Gpu::Handler{&item,shared_data.get_pointer()}); }); }); } catch (sycl::exception const& ex) { @@ -733,24 +857,75 @@ void launch (T const& n, L&& f) noexcept AMREX_GPU_ERROR_CHECK(); } +namespace detail { + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, N /*nleft*/) + noexcept -> decltype(f(0)) + { + f(i); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, N i, N nleft) + noexcept -> decltype(f(0,Gpu::Handler{})) + { + f(i,Gpu::Handler(amrex::min(nleft,(N)blockDim.x))); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, int /*nleft*/) + noexcept -> decltype(f(0,0,0)) + { + f(i,j,k); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, int nleft) + noexcept -> decltype(f(0,0,0,Gpu::Handler{})) + { + f(i,j,k,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, int /*nleft*/) + noexcept -> decltype(f(0,0,0,0)) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n); + } + + template + AMREX_GPU_DEVICE + auto call_f (F const& f, int i, int j, int k, T ncomp, int nleft) + noexcept -> decltype(f(0,0,0,0,Gpu::Handler{})) + { + for (T n = 0; n < ncomp; ++n) f(i,j,k,n,Gpu::Handler(amrex::min(nleft,(int)blockDim.x))); + } +} + template ::value> > amrex::EnableIf_t::value> ParallelFor (Gpu::KernelInfo const& info, T n, L&& f) noexcept { if (amrex::isEmpty(n)) return; #ifdef AMREX_USE_CUDA - if (Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) { + if (!info.hasReduction() && Gpu::inFuseRegion() && info.isFusible() && n <= Gpu::getFuseSizeThreshold()) { Gpu::Register(n, f); } else #endif { amrex::ignore_unused(info); const auto ec = Gpu::ExecutionConfig(n); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (T i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; i < n; i += stride) { - f(i); + detail::call_f(f, i, (n-i+(T)threadIdx.x)); } }); 
AMREX_GPU_ERROR_CHECK(); @@ -764,7 +939,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept if (amrex::isEmpty(box)) return; int ncells = box.numPts(); #ifdef AMREX_USE_CUDA - if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + if (!info.hasReduction() && Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { Gpu::Register(box, f); } else #endif @@ -773,17 +948,19 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, L&& f) noexcept const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; - icell < ncells; icell += stride) { + icell < ncells; icell += stride) + { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; int i = (icell - k*(len.x*len.y)) - j*len.x; i += lo.x; j += lo.y; k += lo.z; - f(i,j,k); + detail::call_f(f, i, j, k, (ncells-icell+(int)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); @@ -797,7 +974,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce if (amrex::isEmpty(box)) return; int ncells = box.numPts(); #ifdef AMREX_USE_CUDA - if (Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { + if (!info.hasReduction() && Gpu::inFuseRegion() && info.isFusible() && ncells <= Gpu::getFuseSizeThreshold()) { Gpu::Register(box, ncomp, f); } else #endif @@ -806,6 +983,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce const auto lo = amrex::lbound(box); const auto len = amrex::length(box); const auto ec = Gpu::ExecutionConfig(ncells); + AMREX_LAUNCH_KERNEL(ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept { for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x; @@ -816,9 +994,7 @@ ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L&& f) noexce i += lo.x; j += lo.y; k += lo.z; - for (T n = 0; n < ncomp; ++n) { - f(i,j,k,n); - } + detail::call_f(f, i, j, k, ncomp, (ncells-icell+(int)threadIdx.x)); } }); AMREX_GPU_ERROR_CHECK(); diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H index 80a8052df8a..9a80512cafa 100644 --- a/Src/Base/AMReX_GpuReduce.H +++ b/Src/Base/AMReX_GpuReduce.H @@ -9,6 +9,29 @@ #include #include +// +// Public interface +// +namespace amrex { namespace Gpu { + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept; + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept; + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept; + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept; + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept; +}} + // // Reduce functions based on _shfl_down_sync // @@ -36,8 +59,8 @@ AMREX_GPU_DEVICE AMREX_FORCE_INLINE T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h) { T* shared = (T*)h.local; - int tid = h.item.get_local_id(0); - sycl::ONEAPI::sub_group const& sg = h.item.get_sub_group(); + int tid = 
h.item->get_local_id(0); + sycl::ONEAPI::sub_group const& sg = h.item->get_sub_group(); int lane = sg.get_local_id()[0]; int wid = sg.get_group_id()[0]; int numwarps = sg.get_group_range()[0]; @@ -46,56 +69,134 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const& h) // if this reduction call is occurring multiple times in a kernel, // and since we don't know how many times the user is calling it, // we do it always to be safe. - h.item.barrier(sycl::access::fence_space::local_space); + h.item->barrier(sycl::access::fence_space::local_space); if (lane == 0) shared[wid] = x; - h.item.barrier(sycl::access::fence_space::local_space); + h.item->barrier(sycl::access::fence_space::local_space); bool b = (tid == 0) || (tid < numwarps); x = b ? shared[lane] : x0; if (wid == 0) x = warp_reduce(x, sg); return x; } +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, + T x0, Gpu::Handler const& handler) +{ + sycl::ONEAPI::sub_group const& sg = handler.item->get_sub_group(); + int wid = sg.get_group_id()[0]; + if ((wid+1)*warpSize <= handler.numActiveThreads) { + x = warp_reduce(x, sg); // full warp + if (sg.get_local_id()[0] == 0) atomic_op(dest, x); + } else { + atomic_op(dest, x); + } +} + template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept +void deviceReduceSum_full (T * dest, T source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), (T)0, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::AddNoRet(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::AddNoRet(dest, source); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept +void deviceReduceMin_full (T * dest, T source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::Min(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::Min(dest, source); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept +void deviceReduceMax_full (T * dest, T source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::Max(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::Max(dest, source); } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept +void deviceReduceLogicalAnd_full (int * dest, int source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 1, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::LogicalAnd(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::LogicalAnd(dest, source); } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept +void deviceReduceLogicalOr_full (int * dest, int source, Gpu::Handler const& h) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 0, h); - if (h.item.get_local_id(0) == 0) Gpu::Atomic::LogicalOr(dest, source); + if (h.item->get_local_id(0) == 0) Gpu::Atomic::LogicalOr(dest, source); +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceSum (T * dest, T source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= 
int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceSum_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicAdd(), (T)0, h); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMin (T * dest, T source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceMin_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMin(), source, h); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMax (T * dest, T source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceMax_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMax(), source, h); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceLogicalAnd_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalAnd(), 1, h); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& h) noexcept +{ + if (h.numActiveThreads >= int(h.item->get_local_range(0)) || h.numActiveThreads <= 0) { + deviceReduceLogicalOr_full(dest, source, h); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalOr(), 0, h); + } } #elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -136,9 +237,23 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0) return x; } +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, + T x0, Gpu::Handler const& handler) +{ + int warp = (int)threadIdx.x / warpSize; + if ((warp+1)*warpSize <= handler.numActiveThreads) { + x = warp_reduce(x); // full warp + if (threadIdx.x % warpSize == 0) atomic_op(dest, x); + } else { + atomic_op(dest,x); + } +} + template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceSum (T * dest, T source) noexcept +void deviceReduceSum_full (T * dest, T source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), (T)0); @@ -147,7 +262,7 @@ void deviceReduceSum (T * dest, T source) noexcept template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMin (T * dest, T source) noexcept +void deviceReduceMin_full (T * dest, T source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source); @@ -156,7 +271,7 @@ void deviceReduceMin (T * dest, T source) noexcept template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceMax (T * dest, T source) noexcept +void deviceReduceMax_full (T * dest, T source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), source); @@ -164,7 +279,7 @@ void deviceReduceMax (T * dest, T source) noexcept } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalAnd (int * dest, int source) noexcept +void deviceReduceLogicalAnd_full (int * dest, int source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 1); @@ -172,18 +287,81 @@ void deviceReduceLogicalAnd (int * dest, int source) noexcept } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void deviceReduceLogicalOr (int * dest, int source) 
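A CUDA-style device sketch of the per-warp decision that `blockReduce_partial` makes above: warps fully covered by `numActiveThreads` reduce with shuffles and issue one atomic per warp, while the trailing, partially active warp falls back to one atomic per thread, because shuffles would read lanes that never entered the loop body. The kernel, the data, and the `block_reduce_partial_sum` name are assumptions for illustration; `atomicAdd` on `double` assumes a compute capability 6.0+ device.

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__device__ void block_reduce_partial_sum (double* dest, double x, int num_active)
{
    int warp = threadIdx.x / warpSize;
    if ((warp + 1) * warpSize <= num_active) {
        // Every lane of this warp is active: a full shuffle reduction is safe,
        // and only lane 0 touches global memory.
        for (int offset = warpSize/2; offset > 0; offset /= 2) {
            x += __shfl_down_sync(0xffffffffu, x, offset);
        }
        if (threadIdx.x % warpSize == 0) { atomicAdd(dest, x); }
    } else {
        // Partially active warp: each active thread updates the result directly.
        atomicAdd(dest, x);
    }
}

__global__ void sum_kernel (double const* v, double* dest, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // Active threads in this block: remaining work, capped at the block size.
        int num_active = min(n - (int)(blockIdx.x * blockDim.x), (int)blockDim.x);
        block_reduce_partial_sum(dest, v[i], num_active);
    }
}

int main ()
{
    int const n = 1000;                       // deliberately not a multiple of 256
    double *v, *dest;
    cudaMallocManaged(&v, n*sizeof(double));
    cudaMallocManaged(&dest, sizeof(double));
    for (int i = 0; i < n; ++i) { v[i] = 1.0; }
    *dest = 0.0;
    sum_kernel<<<(n+255)/256, 256>>>(v, dest, n);
    cudaDeviceSynchronize();
    std::printf("sum = %g (expect %d)\n", *dest, n);
    cudaFree(v); cudaFree(dest);
    return 0;
}
```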
noexcept +void deviceReduceLogicalOr_full (int * dest, int source) noexcept { source = Gpu::blockReduce (source, Gpu::warpReduce >(), 0); if (threadIdx.x == 0) Gpu::Atomic::LogicalOr(dest, source); } +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceSum (T * dest, T source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceSum_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicAdd(), (T)0, handler); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMin (T * dest, T source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceMin_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMin(), source, handler); + } +} + +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceMax (T * dest, T source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceMax_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicMax(), source, handler); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceLogicalAnd_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalAnd(), 1, handler); + } +} + +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const& handler) noexcept +{ + if (handler.numActiveThreads >= (int)blockDim.x || handler.numActiveThreads <= 0) { + deviceReduceLogicalOr_full(dest, source); + } else { + Gpu::blockReduce_partial + (dest, source, Gpu::warpReduce >(), + Gpu::AtomicLogicalOr(), 0, handler); + } +} + #else template AMREX_FORCE_INLINE -void deviceReduceSum (T * dest, T source) noexcept +void deviceReduceSum_full (T * dest, T source) noexcept { #ifdef _OPENMP #pragma omp atomic @@ -193,7 +371,14 @@ void deviceReduceSum (T * dest, T source) noexcept template AMREX_FORCE_INLINE -void deviceReduceMin (T * dest, T source) noexcept +void deviceReduceSum (T * dest, T source, Gpu::Handler const&) noexcept +{ + deviceReduceSum_full(dest, source); +} + +template +AMREX_FORCE_INLINE +void deviceReduceMin_full (T * dest, T source) noexcept { #ifdef _OPENMP #pragma omp critical (gpureduce_reducemin) @@ -203,7 +388,14 @@ void deviceReduceMin (T * dest, T source) noexcept template AMREX_FORCE_INLINE -void deviceReduceMax (T * dest, T source) noexcept +void deviceReduceMin (T * dest, T source, Gpu::Handler const&) noexcept +{ + deviceReduceMin_full(dest, source); +} + +template +AMREX_FORCE_INLINE +void deviceReduceMax_full (T * dest, T source) noexcept { #ifdef _OPENMP #pragma omp critical (gpureduce_reducemax) @@ -211,8 +403,15 @@ void deviceReduceMax (T * dest, T source) noexcept *dest = std::max(*dest, source); } +template +AMREX_FORCE_INLINE +void deviceReduceMax (T * dest, T source, Gpu::Handler const&) noexcept +{ + deviceReduceMax_full(dest, source); +} + AMREX_FORCE_INLINE -void deviceReduceLogicalAnd (int * dest, int source) noexcept +void deviceReduceLogicalAnd_full (int * dest, int source) noexcept { #ifdef _OPENMP #pragma omp 
critical (gpureduce_reduceand) @@ -221,7 +420,13 @@ void deviceReduceLogicalAnd (int * dest, int source) noexcept } AMREX_FORCE_INLINE -void deviceReduceLogicalOr (int * dest, int source) noexcept +void deviceReduceLogicalAnd (int * dest, int source, Gpu::Handler const&) noexcept +{ + deviceReduceLogicalAnd_full(dest, source); +} + +AMREX_FORCE_INLINE +void deviceReduceLogicalOr_full (int * dest, int source) noexcept { #ifdef _OPENMP #pragma omp critical (gpureduce_reduceor) @@ -229,6 +434,12 @@ void deviceReduceLogicalOr (int * dest, int source) noexcept *dest = (*dest) || source; } +AMREX_FORCE_INLINE +void deviceReduceLogicalOr (int * dest, int source, Gpu::Handler const&) noexcept +{ + deviceReduceLogicalOr_full(dest, source); +} + #endif }} diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 42b6fef73a1..fa660545738 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -26,18 +26,41 @@ struct gpuStream_t { bool operator== (gpuStream_t const& rhs) noexcept { return queue == rhs.queue; } }; -namespace Gpu { -struct Handler { - Handler (sycl::nd_item<1> const& a_item, void* a_local = nullptr) - : item(a_item), local(a_local) {} - sycl::nd_item<1> const& item; - void* local; // DPC++ local memory -}; +#endif + } #endif -} +namespace amrex { namespace Gpu { + +#if defined(AMREX_USE_DPCPP) + +struct Handler +{ + Handler (sycl::nd_item<1> const* a_item = nullptr, void* a_local = nullptr, + int a_n_active_threds = -1) + : item(a_item), local(a_local), numActiveThreads(a_n_active_threds) {} + sycl::nd_item<1> const* item; + void* local; // DPC++ local memory + int numActiveThreads; +}; + +#elif defined(AMREX_USE_GPU) + +struct Handler +{ + AMREX_GPU_HOST_DEVICE constexpr Handler (int n_active_threads = -1) + : numActiveThreads(n_active_threads) {} + int numActiveThreads; +}; + +#else + +struct Handler {}; #endif + +}} + #endif diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index b63b1220dcb..c660adb91e2 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -77,12 +77,12 @@ struct ReduceOpSum template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceSum(&d,s,h); + Gpu::deviceReduceSum_full(&d,s,h); } #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceSum(&d,s); } + void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceSum_full(&d,s); } #endif template @@ -99,12 +99,12 @@ struct ReduceOpMin template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceMin(&d,s,h); + Gpu::deviceReduceMin_full(&d,s,h); } #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMin(&d,s); } + void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMin_full(&d,s); } #endif template @@ -121,12 +121,12 @@ struct ReduceOpMax template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (T& d, T const& s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceMax(&d,s,h); + Gpu::deviceReduceMax_full(&d,s,h); } #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMax(&d,s); } + void parallel_update (T& d, T const& s) const noexcept { Gpu::deviceReduceMax_full(&d,s); } #endif template @@ -142,11 +142,11 @@ struct 
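A tiny restatement (illustrative only, not AMReX code) of the dispatch condition used by the new `deviceReduce*` overloads: a default-constructed `Handler` carries `numActiveThreads == -1`, so existing call sites fall through to the full-block reduction, and the `ReduceOp*` functors below can keep calling the `_full` variants; only a positive count smaller than the block size selects the partial path.

```cpp
#include <cassert>

bool use_full_block (int num_active_threads, int block_size)
{
    return num_active_threads >= block_size || num_active_threads <= 0;
}

int main ()
{
    assert( use_full_block(-1, 256));   // default Handler: full-block reduction
    assert( use_full_block(256, 256));  // fully populated block
    assert(!use_full_block(232, 256));  // trailing, partially filled block
    return 0;
}
```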
ReduceOpLogicalAnd #ifdef AMREX_USE_DPCPP AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (int& d, int s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceLogicalAnd(&d,s,h); + Gpu::deviceReduceLogicalAnd_full(&d,s,h); } #else AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalAnd(&d,s); } + void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalAnd_full(&d,s); } #endif AMREX_GPU_DEVICE AMREX_FORCE_INLINE @@ -160,11 +160,11 @@ struct ReduceOpLogicalOr #ifdef AMREX_USE_DPCPP AMREX_GPU_DEVICE AMREX_FORCE_INLINE void parallel_update (int& d, int s, Gpu::Handler const& h) const noexcept { - Gpu::deviceReduceLogicalOr(&d,s,h); + Gpu::deviceReduceLogicalOr_full(&d,s,h); } #else AMREX_GPU_DEVICE AMREX_FORCE_INLINE - void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalOr(&d,s); } + void parallel_update (int& d, int s) const noexcept { Gpu::deviceReduceLogicalOr_full(&d,s); } #endif AMREX_GPU_DEVICE AMREX_FORCE_INLINE @@ -295,7 +295,7 @@ public: [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { ReduceTuple r = *(dp+1); - for (int icell = gh.item.get_global_id(0), stride = gh.item.get_global_range(0); + for (int icell = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); icell < ncells; icell += stride) { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; @@ -363,7 +363,7 @@ public: [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { ReduceTuple r = *(dp+1); - for (int icell = gh.item.get_global_id(0), stride = gh.item.get_global_range(0); + for (int icell = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); icell < ncells; icell += stride) { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; @@ -415,7 +415,7 @@ public: [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { ReduceTuple r = *(dp+1); - for (N i = gh.item.get_global_id(0), stride = gh.item.get_global_range(0); + for (N i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); i < n; i += stride) { auto pr = f(i); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr); @@ -453,12 +453,12 @@ T Sum (N n, U const* v, T init_val, BOP bop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (T const& r, Gpu::Handler const& h) noexcept { - Gpu::deviceReduceSum(dp, r, h); + Gpu::deviceReduceSum_full(dp, r, h); }); #else [=] AMREX_GPU_DEVICE (T const& r) noexcept { - Gpu::deviceReduceSum(dp, r); + Gpu::deviceReduceSum_full(dp, r); }); #endif return ds.dataValue(); @@ -495,12 +495,12 @@ T Min (N n, U const* v, T init_val, BOP bop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (T const& r, Gpu::Handler const& h) noexcept { - Gpu::deviceReduceMin(dp, r, h); + Gpu::deviceReduceMin_full(dp, r, h); }); #else [=] AMREX_GPU_DEVICE (T const& r) noexcept { - Gpu::deviceReduceMin(dp, r); + Gpu::deviceReduceMin_full(dp, r); }); #endif return ds.dataValue(); @@ -526,12 +526,12 @@ T Max (N n, U const* v, T init_val, BOP bop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (T const& r, Gpu::Handler const& h) noexcept { - Gpu::deviceReduceMax(dp, r, h); + Gpu::deviceReduceMax_full(dp, r, h); }); #else [=] AMREX_GPU_DEVICE (T const& r) noexcept { - Gpu::deviceReduceMax(dp, r); + Gpu::deviceReduceMax_full(dp, r); }); #endif return ds.dataValue(); @@ -560,14 +560,14 @@ std::pair MinMax (N n, U const* v, MINOP minop, MAXOP maxop) #ifdef AMREX_USE_DPCPP [=] AMREX_GPU_DEVICE (Real2 const& r, Gpu::Handler const& h) noexcept { - 
Gpu::deviceReduceMin(dp , r[0], h); - Gpu::deviceReduceMax(dp+1, r[1], h); + Gpu::deviceReduceMin_full(dp , r[0], h); + Gpu::deviceReduceMax_full(dp+1, r[1], h); }); #else [=] AMREX_GPU_DEVICE (Real2 const& r) noexcept { - Gpu::deviceReduceMin(dp , r[0]); - Gpu::deviceReduceMax(dp+1, r[1]); + Gpu::deviceReduceMin_full(dp , r[0]); + Gpu::deviceReduceMax_full(dp+1, r[1]); }); #endif Gpu::dtoh_memcpy(hv.data(), dp, 2*sizeof(T)); @@ -793,7 +793,7 @@ T Sum (N n, U const* v, T init_val, BOP bop) }, [=] (T r) noexcept { - Gpu::deviceReduceSum(dp, r); + Gpu::deviceReduceSum_full(dp, r); }); return sum; } @@ -830,7 +830,7 @@ T Min (N n, U const* v, T init_val, BOP bop) }, [=] (T r) noexcept { - Gpu::deviceReduceMin(dp, r); + Gpu::deviceReduceMin_full(dp, r); }); return mn; } @@ -853,7 +853,7 @@ T Max (N n, U const* v, T init_val, BOP bop) }, [=] (T r) noexcept { - Gpu::deviceReduceMax(dp, r); + Gpu::deviceReduceMax_full(dp, r); }); return mx; } @@ -878,8 +878,8 @@ std::pair MinMax (N n, U const* v, MINOP minop, MAXOP maxop) }, [=] (Real2 const& r) noexcept { - Gpu::deviceReduceMin(dp , r[0]); - Gpu::deviceReduceMax(dp+1, r[1]); + Gpu::deviceReduceMin_full(dp , r[0]); + Gpu::deviceReduceMax_full(dp+1, r[1]); }); return std::make_pair(hv[0],hv[1]); } diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H index 7d03ce0faf5..e0a4f1e9ab1 100644 --- a/Src/Base/AMReX_Scan.H +++ b/Src/Base/AMReX_Scan.H @@ -199,14 +199,14 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) amrex::launch(nblocks, nthreads, sm, stream, [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - sycl::ONEAPI::sub_group const& sg = gh.item.get_sub_group(); + sycl::ONEAPI::sub_group const& sg = gh.item->get_sub_group(); int lane = sg.get_local_id()[0]; int warp = sg.get_group_id()[0]; int nwarps = sg.get_group_range()[0]; - int threadIdxx = gh.item.get_local_id(0); - int blockDimx = gh.item.get_local_range(0); - int gridDimx = gh.item.get_group_range(0); + int threadIdxx = gh.item->get_local_id(0); + int blockDimx = gh.item->get_local_range(0); + int gridDimx = gh.item->get_group_range(0); T* shared = (T*)(gh.local); T* shared2 = shared + Gpu::Device::warp_size; @@ -222,7 +222,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) (virtual_block_id_p, gridDimx); virtual_block_id_shared = bid; } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); virtual_block_id = virtual_block_id_shared; } @@ -265,7 +265,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) shared[warp] = x; } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); // The first warp will do scan on the warp sums for the // whole block. @@ -279,7 +279,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) if (lane < nwarps) shared2[lane] = y; } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); // shared[0:nwarps) holds the inclusive sum of warp sums. 
@@ -317,7 +317,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) int iblock = iblock0-lane; detail::STVA stva{'p', 0}; if (iblock >= 0) { - stva = pbs[iblock].wait(gh.item); + stva = pbs[iblock].wait(*gh.item); } T x = stva.value; @@ -359,7 +359,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, Type type) } } - gh.item.barrier(sycl::access::fence_space::local_space); + gh.item->barrier(sycl::access::fence_space::local_space); T exclusive_prefix = shared[0]; diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index bc5b2443755..7edf2ed77df 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -194,6 +194,7 @@ target_sources( amrex AMReX_GpuError.H AMReX_GpuDevice.H AMReX_GpuDevice.cpp + AMReX_GpuBuffer.H AMReX_GpuAtomic.H AMReX_GpuUtility.H AMReX_GpuUtility.cpp diff --git a/Src/Base/Make.package b/Src/Base/Make.package index e635b0928ee..a1a4dff0e62 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -83,6 +83,7 @@ C$(AMREX_BASE)_headers += AMReX_GpuMemory.H AMReX_GpuRange.H C$(AMREX_BASE)_headers += AMReX_GpuAtomic.H C$(AMREX_BASE)_sources += AMReX_GpuDevice.cpp AMReX_GpuUtility.cpp C$(AMREX_BASE)_headers += AMReX_GpuDevice.H AMReX_GpuUtility.H +C$(AMREX_BASE)_headers += AMReX_GpuBuffer.H C$(AMREX_BASE)_headers += AMReX_GpuAsyncArray.H C$(AMREX_BASE)_sources += AMReX_GpuAsyncArray.cpp diff --git a/Src/EB/AMReX_EB2_GeometryShop.H b/Src/EB/AMReX_EB2_GeometryShop.H index d1ac0401fbe..0df73f1dfda 100644 --- a/Src/EB/AMReX_EB2_GeometryShop.H +++ b/Src/EB/AMReX_EB2_GeometryShop.H @@ -28,6 +28,7 @@ Real IF_f (F const& f, GpuArray const& p) noexcept { #if AMREX_DEVICE_COMPILE + amrex::ignore_unused(f,p); amrex::Error("EB2::GeometryShop: how did this happen?"); return 0.0; #else diff --git a/Tutorials/GPU/ParallelReduce/GNUmakefile b/Tutorials/GPU/ParallelReduce/GNUmakefile index 2754352287a..19a7b9ded31 100644 --- a/Tutorials/GPU/ParallelReduce/GNUmakefile +++ b/Tutorials/GPU/ParallelReduce/GNUmakefile @@ -13,6 +13,7 @@ USE_OMP = FALSE USE_HIP = FALSE TINY_PROFILE = TRUE +BL_NO_FORT = TRUE include $(AMREX_HOME)/Tools/GNUMake/Make.defs diff --git a/Tutorials/GPU/ParallelReduce/main.cpp b/Tutorials/GPU/ParallelReduce/main.cpp index ee1e82ad4a0..31fae4efa03 100644 --- a/Tutorials/GPU/ParallelReduce/main.cpp +++ b/Tutorials/GPU/ParallelReduce/main.cpp @@ -24,10 +24,12 @@ void main_main () { int ncell = 512; int max_grid_size = 128; + int nghost = 0; { ParmParse pp; pp.query("ncell", ncell); pp.query("max_grid_size", max_grid_size); + pp.query("nghost", nghost); } BoxArray ba; @@ -37,11 +39,11 @@ void main_main () ba.maxSize(max_grid_size); } - MultiFab mf(ba,DistributionMapping{ba},1,0); - iMultiFab imf(ba,mf.DistributionMap(),1,0); + MultiFab mf(ba,DistributionMapping{ba},1,nghost); + iMultiFab imf(ba,mf.DistributionMap(),1,nghost); for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - Box const& bx = mfi.validbox(); + Box const& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); auto const& ifab = imf.array(mfi); @@ -53,6 +55,96 @@ void main_main () }); } + int N = 1000000; + Gpu::DeviceVector vec(N); + Real* pvec = vec.dataPtr(); + amrex::ParallelForRNG( N, + [=] AMREX_GPU_DEVICE (int i, RandomEngine const& engine) noexcept + { + pvec[i] = amrex::Random(engine) - 0.5; + }); + + { + BL_PROFILE("ParallelForReduction-box-3"); + + Gpu::Buffer da({0.0, std::numeric_limits::max(), + std::numeric_limits::lowest()}); + Real* dp = da.data(); + for (MFIter mfi(mf); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.fabbox(); + Array4 const& fab = mf.const_array(mfi); 
+ amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp , fab(i,j,k), handler); + Gpu::deviceReduceMin(dp+1, fab(i,j,k), handler); + Gpu::deviceReduceMax(dp+2, fab(i,j,k), handler); + }); + } + Real* hp = da.copyToHost(); + ParallelDescriptor::ReduceRealSum(hp[0]); + ParallelDescriptor::ReduceRealMin(hp[1]); + ParallelDescriptor::ReduceRealMax(hp[2]); + amrex::Print().SetPrecision(17) << "sum: " << hp[0] << "\n" + << "min: " << hp[1] << "\n" + << "max: " << hp[2] << "\n"; + } + + { + BL_PROFILE("ParallelForReduction-box-sum"); + + Gpu::Buffer da({0.0}); + Real* dp = da.data(); + for (MFIter mfi(mf); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.fabbox(); + Array4 const& fab = mf.const_array(mfi); + amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp, fab(i,j,k), handler); + }); + } + Real* hp = da.copyToHost(); + ParallelDescriptor::ReduceRealSum(hp[0]); + amrex::Print().SetPrecision(17) << "sum: " << hp[0] << "\n"; + } + + { + BL_PROFILE("ParallelForReduction-box-isum"); + + Gpu::Buffer da({0}); + Long* dp = da.data(); + for (MFIter mfi(imf); mfi.isValid(); ++mfi) + { + Box const& bx = mfi.fabbox(); + Array4 const& ifab = imf.const_array(mfi); + amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), bx, + [=] AMREX_GPU_DEVICE (int i, int j, int k, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp, ifab(i,j,k), handler); + }); + } + Long* hp = da.copyToHost(); + ParallelDescriptor::ReduceLongSum(hp[0]); + amrex::Print().SetPrecision(17) << "isum: " << hp[0] << "\n"; + } + + { + BL_PROFILE("ParallelForReduction-vec-1"); + Gpu::Buffer da({0.0}); + Real* dp = da.data(); + amrex::ParallelFor(Gpu::KernelInfo().setReduction(true), N, + [=] AMREX_GPU_DEVICE (int i, Gpu::Handler const& handler) noexcept + { + Gpu::deviceReduceSum(dp, amrex::Math::abs(pvec[i]), handler); + }); + Real* hp = da.copyToHost(); + ParallelDescriptor::ReduceRealSum(hp[0]); + amrex::Print().SetPrecision(17) << "1-norm: " << hp[0] << "\n"; + } + { BL_PROFILE("MultiFab::sum"); amrex::Print().SetPrecision(17) << "sum: " << mf.sum() << "\n"; @@ -60,12 +152,12 @@ void main_main () { BL_PROFILE("MultiFab::min"); - amrex::Print().SetPrecision(17) << "min: " << mf.min(0) << "\n"; + amrex::Print().SetPrecision(17) << "min: " << mf.min(0, nghost) << "\n"; } { BL_PROFILE("MultiFab::max"); - amrex::Print().SetPrecision(17) << "max: " << mf.max(0) << "\n"; + amrex::Print().SetPrecision(17) << "max: " << mf.max(0, nghost) << "\n"; } { @@ -83,7 +175,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); auto const& ifab = imf.array(mfi); reduce_op.eval(bx, reduce_data, @@ -117,17 +209,17 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); auto const& ifab = imf.array(mfi); reduce_op.eval(bx, reduce_data, - [=] AMREX_GPU_DEVICE (Box const& bx) -> ReduceTuple + [=] AMREX_GPU_DEVICE (Box const& b) -> ReduceTuple { Real rsum = 0.; Real rmin = 1.e30; // If not because of cuda 9.2, Real rmax = -1.e30; // we should use numeric_limits. 
Long lsum = 0; - amrex::Loop(bx, + amrex::Loop(b, [=,&rsum,&rmin,&rmax,&lsum] (int i, int j, int k) { Real x = fab(i,j,k); Long ix = static_cast(ifab(i,j,k)); @@ -161,7 +253,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -185,7 +277,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -209,7 +301,7 @@ void main_main () for (MFIter mfi(mf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& fab = mf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -233,7 +325,7 @@ void main_main () for (MFIter mfi(imf); mfi.isValid(); ++mfi) { - const Box& bx = mfi.validbox(); + const Box& bx = mfi.fabbox(); auto const& ifab = imf.array(mfi); reduce_op.eval(bx, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple @@ -248,15 +340,6 @@ void main_main () amrex::Print() << "isum: " << hv << "\n"; } - int N = 1000000; - Gpu::DeviceVector vec(N); - Real* pvec = vec.dataPtr(); - amrex::ParallelForRNG( N, - [=] AMREX_GPU_DEVICE (int i, RandomEngine const& engine) noexcept - { - pvec[i] = amrex::Random(engine) - 0.5; - }); - { BL_PROFILE("VecReduce");
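The tutorial's `VecReduce` block is cut off above. Independent of its exact contents, a minimal sketch of the `Gpu::VecReduce` pattern whose implementation appears earlier in this diff: the first lambda accumulates a per-thread value, the second folds that value into the destination once per block, mirroring how `Reduce::Sum` uses it in this patch. The `one_norm` name, the input vector, and the header choice are assumptions for illustration.

```cpp
#include <AMReX_Gpu.H>
#include <AMReX_Math.H>

amrex::Real one_norm (amrex::Gpu::DeviceVector<amrex::Real> const& vec)
{
    using namespace amrex;
    Real const* p = vec.dataPtr();
    Gpu::DeviceScalar<Real> ds(0.0);
    Real* dp = ds.dataPtr();
    Gpu::VecReduce(vec.size(), Real(0.0),
        [=] AMREX_GPU_DEVICE (int i, Real* r) noexcept
        {
            *r += amrex::Math::abs(p[i]);          // per-thread partial sum
        },
#ifdef AMREX_USE_DPCPP
        [=] AMREX_GPU_DEVICE (Real r, Gpu::Handler const& h) noexcept
        {
            Gpu::deviceReduceSum_full(dp, r, h);   // combine across the block
        });
#else
        [=] AMREX_GPU_DEVICE (Real r) noexcept
        {
            Gpu::deviceReduceSum_full(dp, r);
        });
#endif
    return ds.dataValue();
}
```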