Commit 9f6d349

Merge pull request #1625 from LLNL/feature/burmark1/reduction_tunings

Add more cuda/hip reducer tunings

2 parents: c315ddd + 89004eb

26 files changed: +3636 -1339 lines

.gitlab/custom-jobs-and-variables.yml (+2 -2)

@@ -25,7 +25,7 @@ variables:
 
   # Poodle
   # Arguments for top level allocation
-  POODLE_SHARED_ALLOC: "--exclusive --time=60 --nodes=1"
+  POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1"
   # Arguments for job level allocation
   POODLE_JOB_ALLOC: "--nodes=1"
   # Project specific variants for poodle

@@ -56,7 +56,7 @@ variables:
   # Lassen and Butte use a different job scheduler (spectrum lsf) that does not
   # allow pre-allocation the same way slurm does.
   # Arguments for job level allocation
-  LASSEN_JOB_ALLOC: "1 -W 30 -q pci"
+  LASSEN_JOB_ALLOC: "1 -W 40 -q pci"
   # Project specific variants for lassen
   PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70"
   # Project specific deps for lassen
docs/Licenses/rocprim-license.txt (+21)

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
docs/sphinx/user_guide/cook_book/reduction.rst (+22 -5)

@@ -46,21 +46,21 @@ Here a simple sum reduction is performed in a for loop::
 
 The results of these operations will yield the following values:
 
-* vsum == 1000
+* ``vsum == 1000``
 
 RAJA uses policy types to specify how things are implemented.
 
 The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied.
 For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The
-``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a CUDA GPU kernel with
+``RAJA::cuda_exec_with_reduce<256>`` runs the loop as a CUDA GPU kernel with
 256 threads per block and other CUDA kernel launch parameters, like the
 number of blocks, optimized for performance with reducers.::
 
     using exec_policy = RAJA::seq_exec;
    // using exec_policy = RAJA::omp_parallel_for_exec;
    // using exec_policy = RAJA::omp_target_parallel_for_exec<256>;
-   // using exec_policy = RAJA::cuda_exec_rec_for_reduce<256>;
-   // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>;
+   // using exec_policy = RAJA::cuda_exec_with_reduce<256>;
+   // using exec_policy = RAJA::hip_exec_with_reduce<256>;
    // using exec_policy = RAJA::sycl_exec<256>;
 
 The reduction policy specifies how the reduction is done and must match the

@@ -90,4 +90,21 @@ Here a simple sum reduction is performed using RAJA::
 
 The results of these operations will yield the following values:
 
-* vsum.get() == 1000
+* ``vsum.get() == 1000``
+
+
+Another option for the execution policy when using the cuda or hip backends is
+a base policy, which has a boolean parameter to choose between the general-use
+``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy.::
+
+   // static constexpr bool with_reduce = ...;
+   // using exec_policy = RAJA::cuda_exec_base<with_reduce, 256>;
+   // using exec_policy = RAJA::hip_exec_base<with_reduce, 256>;
+
+Another option for the reduction policy when using the cuda or hip backends is
+a base policy, which has a boolean parameter to choose between the atomic
+``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce`` policy.::
+
+   // static constexpr bool with_atomic = ...;
+   // using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;
+   // using reduce_policy = RAJA::hip_reduce_base<with_atomic>;
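To see how the documented pieces fit together, here is a minimal, self-contained
sketch (ours, not part of this commit) that combines the execution and reduction
base policies from the doc hunk above in a sum reduction; it assumes a
CUDA-enabled RAJA build, and everything other than the policy spellings taken
from the documentation is illustrative::

    #include "RAJA/RAJA.hpp"

    int main()
    {
      constexpr int N = 1000;

      // Policy choices in the documented base-policy form; the booleans pick
      // the reducer-tuned kernel and the non-atomic reduction at compile time.
      static constexpr bool with_reduce = true;
      static constexpr bool with_atomic = false;
      using exec_policy   = RAJA::cuda_exec_base<with_reduce, 256>;
      using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;

      RAJA::ReduceSum<reduce_policy, int> vsum(0);

      RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
          [=] RAJA_DEVICE (int /*i*/) {
            vsum += 1;  // each of the N iterates contributes 1
          });

      // Matches the cook book result: vsum.get() == 1000
      return (vsum.get() == N) ? 0 : 1;
    }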

docs/sphinx/user_guide/feature/policies.rst (+422 -340)

(Large diff not rendered.)

include/RAJA/RAJA.hpp (+6)

@@ -33,6 +33,7 @@
 #include "RAJA/util/camp_aliases.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "RAJA/util/math.hpp"
 #include "RAJA/util/plugins.hpp"
 #include "RAJA/util/Registry.hpp"
 #include "RAJA/util/for_each.hpp"

@@ -156,6 +157,11 @@
 //
 #include "RAJA/util/sort.hpp"
 
+//
+// reduce algorithms
+//
+#include "RAJA/util/reduce.hpp"
+
 //
 // WorkPool, WorkGroup, WorkSite objects
 //

include/RAJA/policy/cuda/MemUtils_CUDA.hpp (+28 -3)

@@ -61,7 +61,7 @@ struct PinnedAllocator {
     return ptr;
   }
 
-  // returns true on success, false on failure
+  // returns true on success, throws a run time error exception on failure
   bool free(void* ptr)
   {
     cudaErrchk(cudaFreeHost(ptr));

@@ -80,7 +80,7 @@ struct DeviceAllocator {
     return ptr;
   }
 
-  // returns true on success, false on failure
+  // returns true on success, throws a run time error exception on failure
   bool free(void* ptr)
   {
     cudaErrchk(cudaFree(ptr));

@@ -103,7 +103,31 @@ struct DeviceZeroedAllocator {
     return ptr;
   }
 
-  // returns true on success, false on failure
+  // returns true on success, throws a run time error exception on failure
+  bool free(void* ptr)
+  {
+    cudaErrchk(cudaFree(ptr));
+    return true;
+  }
+};
+
+//! Allocator for device pinned memory for use in basic_mempool
+struct DevicePinnedAllocator {
+
+  // returns a valid pointer on success, nullptr on failure
+  void* malloc(size_t nbytes)
+  {
+    int device;
+    cudaErrchk(cudaGetDevice(&device));
+    void* ptr;
+    cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal));
+    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
+    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
+
+    return ptr;
+  }
+
+  // returns true on success, throws a run time error exception on failure
   bool free(void* ptr)
   {
     cudaErrchk(cudaFree(ptr));

@@ -114,6 +138,7 @@ struct DeviceZeroedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
+using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
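A hedged usage sketch of the new ``device_pinned_mempool_type`` alias (our
example, not the commit's): it assumes RAJA's ``basic_mempool::MemPool``
singleton interface (``getInstance``, typed ``malloc``, ``free``) and that the
alias lives in the ``RAJA::cuda`` namespace alongside the existing mempool
aliases; the function name is hypothetical::

    #include <cstddef>
    #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"

    // Sketch only: exercise the DevicePinnedAllocator added above through
    // the new mempool alias.
    void scratch_example(std::size_t n)
    {
      auto& pool = RAJA::cuda::device_pinned_mempool_type::getInstance();

      // Managed memory preferring the current device but advised for CPU
      // access, per DevicePinnedAllocator::malloc above.
      double* buf = pool.malloc<double>(n);

      // ... launch kernels that write buf, then read the results on the host ...

      pool.free(buf);  // on failure, cudaErrchk throws rather than returning false
    }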
