From 2d9dc6dc28ec91aa5e5930fff0159038db107196 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 12 Mar 2024 11:04:30 +0100 Subject: [PATCH 001/108] Update Uberenv + From RSC: Starting the process to move to spack environments --- scripts/radiuss-spack-configs | 2 +- scripts/uberenv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index af75606a7f..11e8d91e90 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit af75606a7fc0492e35cdd3860337c4e873f43124 +Subproject commit 11e8d91e9093ac237fe00dab36470e24545e4e77 diff --git a/scripts/uberenv b/scripts/uberenv index 4941c237ee..0a39ce245d 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit 4941c237eec514d6d68872243efb9f4af8843f4d +Subproject commit 0a39ce245d7866374bf4724bec9da6ab4cf4dfcc From 053300031f2bb755b7ccd41b2e3712b1112b3994 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:23:25 +0100 Subject: [PATCH 002/108] From RSC: Merge config.yaml content into the environment files --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 11e8d91e90..1dc78a3d2f 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 11e8d91e9093ac237fe00dab36470e24545e4e77 +Subproject commit 1dc78a3d2f8ee919af0127d979e3c0ca4a63df38 From fe6c45ae446166eddb5fdeeae5a0cc86ac7bd130 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:56:27 +0100 Subject: [PATCH 003/108] From RSC: Fix config section names --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 1dc78a3d2f..eb964123fd 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 1dc78a3d2f8ee919af0127d979e3c0ca4a63df38 +Subproject commit eb964123fd422b1bc6cb848f482536009e15a393 From 32c09fa95d5e4fd64c6b6fb6bf1a52bee36d34b0 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 21 Mar 2024 11:01:48 +0100 Subject: [PATCH 004/108] From RSC: Update RADIUSS CI jobs with new compilers on lassen --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index af75606a7f..d8869052d1 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit af75606a7fc0492e35cdd3860337c4e873f43124 +Subproject commit d8869052d137137e1a8f2f36a93c10c91ed0e90c From dcc46c687f788900ed5882b850c106ab07f825e5 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 21 Mar 2024 22:14:59 +0100 Subject: [PATCH 005/108] From RSC: Apply changes to reduce the gap with spack packages --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index d8869052d1..55a4821edc 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit d8869052d137137e1a8f2f36a93c10c91ed0e90c +Subproject commit 55a4821edce8dbad4ef4f36b7e0c34e04984ab74 From d51e0888e5f970107fdc6eecb50a6f8e15ff944b Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:07:53 +0100 Subject: [PATCH 006/108] Restore clang 12.0.1 jobs in lassen CI, fix spectrum-mpi paths, enforce compiler versions with @= --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index d8869052d1..9fe6f19c7c 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit d8869052d137137e1a8f2f36a93c10c91ed0e90c +Subproject commit 9fe6f19c7c0900acea9daabf7796798193551773 From fc5ec7b414d53365123959abd5f72a768466fb3c Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:29:56 +0100 Subject: [PATCH 007/108] From RSC: changes for Caliper --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 316fffdf09..8938041fb2 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 316fffdf099576b8e90fee06834b7dd66898c49b +Subproject commit 8938041fb20dde5e55ae2014aa71333076d139c9 From a868503d8085772dd5f9212800e2980eca4dec6c Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:04:54 +0100 Subject: [PATCH 008/108] From RSC: Apply new changes from Spack PR + activate vectorization in CI Vectorization support now defaults to false in RAJA Spack package --- .gitlab/custom-jobs-and-variables.yml | 10 +++++----- scripts/radiuss-spack-configs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index eb7011b78a..f652bb2caf 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -19,7 +19,7 @@ variables: # Note: We repeat the reservation, necessary when jobs are manually re-triggered. 
RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" # Project specific variants for ruby - PROJECT_RUBY_VARIANTS: "~shared +openmp +tests" + PROJECT_RUBY_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for ruby PROJECT_RUBY_DEPS: "" @@ -29,7 +29,7 @@ variables: # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle - PROJECT_POODLE_VARIANTS: "~shared +openmp +tests" + PROJECT_POODLE_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for poodle PROJECT_POODLE_DEPS: "" @@ -39,7 +39,7 @@ variables: # Arguments for job level allocation CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_CORONA_VARIANTS: "~shared ~openmp +tests" + PROJECT_CORONA_VARIANTS: "~shared ~openmp +vectorization +tests" # Project specific deps for corona PROJECT_CORONA_DEPS: "^blt@develop " @@ -49,7 +49,7 @@ variables: # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp +tests" + PROJECT_TIOGA_VARIANTS: "~shared ~openmp +vectorization +tests" # Project specific deps for corona PROJECT_TIOGA_DEPS: "^blt@develop " @@ -58,7 +58,7 @@ variables: # Arguments for job level allocation LASSEN_JOB_ALLOC: "1 -W 30 -q pci" # Project specific variants for lassen - PROJECT_LASSEN_VARIANTS: "~shared +openmp +tests cuda_arch=70" + PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70" # Project specific deps for lassen PROJECT_LASSEN_DEPS: "^blt@develop " diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 8938041fb2..3d7465cecf 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 8938041fb20dde5e55ae2014aa71333076d139c9 +Subproject commit 3d7465cecf1285064df8a19668ccc66e24b9b388 From ab756d507ac0ea3c7f5c342d30ed908a06100dbc Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:55:36 +0100 Subject: [PATCH 009/108] From RSC: Fix variable name --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 3d7465cecf..6b706f2d20 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 3d7465cecf1285064df8a19668ccc66e24b9b388 +Subproject commit 6b706f2d20d608ea2a9c5e4bf5d6412345b4bd4a From 34d63081d0bf291f77944f8e491fffff5ec6a774 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Wed, 27 Mar 2024 09:56:54 +0100 Subject: [PATCH 010/108] From RSC: Remove CUDA_ARCH, Fix MPI utility function (used by RAJAPerf) --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 6b706f2d20..16c942203d 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 6b706f2d20d608ea2a9c5e4bf5d6412345b4bd4a +Subproject commit 16c942203dd4dc42d3e030d7d643c7b5c3f4108b From cbf7ee3ab9e65c476878f70c75af9c0d756ef5eb Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 10:30:45 +0100 Subject: [PATCH 011/108] From RSC: Restore basic MPI support in RAJAPerf --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 16c942203d..5a2b0e7a0b 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 16c942203dd4dc42d3e030d7d643c7b5c3f4108b +Subproject commit 5a2b0e7a0b42d1585d07ee81b78149a3aa5c5544 From c2d87e1bf5ed1710e11a65710460e5480a52d3fe Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 11:20:22 +0100 Subject: [PATCH 012/108] From RSC: RAJAPerf, Umpire, Caliper MPI handling like Axom --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 5a2b0e7a0b..841f99671c 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 5a2b0e7a0b42d1585d07ee81b78149a3aa5c5544 +Subproject commit 841f99671ca0d8bf040f48b07649f06aa0431f51 From 808c7e1c438c8dee2ff033447b3fa4f9263139c0 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 11:41:47 +0100 Subject: [PATCH 013/108] From RSC: Fix missing import --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 841f99671c..63ac13d2ac 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 841f99671ca0d8bf040f48b07649f06aa0431f51 +Subproject commit 63ac13d2acfa0d164b51e193f0e56c48b52afe0d From d18bf5eea12463af5c57703e271bcd9112207a31 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 16:04:26 +0100 Subject: [PATCH 014/108] From RSC: Fix calling class function from outside + clean super arguments --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 63ac13d2ac..fe4b00160f 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 63ac13d2acfa0d164b51e193f0e56c48b52afe0d +Subproject commit fe4b00160f09f8dd4b9d32aff396f3ba0ac8a1e2 From a17c1be9fbc88c031382bf747dd5a56cf5f96cdb Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 18:17:51 +0100 Subject: [PATCH 015/108] Update uberenv --- scripts/uberenv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/uberenv b/scripts/uberenv index 0a39ce245d..cf91883ef0 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit 0a39ce245d7866374bf4724bec9da6ab4cf4dfcc +Subproject commit cf91883ef0500a808338ad6c8b56647da15fa5f3 From f3ecdc2db1a28df43bdf98c7387ee6938b7790a1 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 29 Mar 2024 16:53:28 +0100 Subject: [PATCH 016/108] From RSC: Fix: use slurm on toss4 cray machines --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 056c003a1e..b09f869f9d 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 056c003a1ed89c301867813a0c20aeb337fc1d6e +Subproject commit b09f869f9d9aff6ecf6544a0161d96c2b18d13b8 From 304224d7da981a89c85dc77c95a9dbda34505f32 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 31 Mar 2024 11:41:47 -0700 Subject: [PATCH 017/108] Add compile time fraction class and test --- include/RAJA/util/types.hpp | 19 ++++++++++ test/unit/util/CMakeLists.txt | 4 ++ test/unit/util/test-fraction.cpp | 64 ++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 test/unit/util/test-fraction.cpp diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 811f681b9b..3c1aeaf042 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -172,6 +172,25 @@ struct SizeList { }; +/// +/// Compile time fraction for use with integral types +/// +template +struct Fraction +{ + static_assert(denominator != int_t(0), "denominator may not be zero"); + + using inverse = Fraction; + + static constexpr int_t multiply(int_t val) noexcept + { + return (val / denominator) * numerator + + (val % denominator) * numerator / denominator; + } + +}; + + /*! ****************************************************************************** * diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index fdec220da9..869b897714 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -21,4 +21,8 @@ raja_add_test( NAME test-span SOURCES test-span.cpp) +raja_add_test( + NAME test-fraction + SOURCES test-fraction.cpp) + add_subdirectory(operator) diff --git a/test/unit/util/test-fraction.cpp b/test/unit/util/test-fraction.cpp new file mode 100644 index 0000000000..5161b2bb3a --- /dev/null +++ b/test/unit/util/test-fraction.cpp @@ -0,0 +1,64 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for Fraction +/// + +#include +#include "RAJA_gtest.hpp" +#include + +template +void testFractionMultiplyTypesValues() +{ + using Frac = RAJA::Fraction; + + ASSERT_EQ(Frac::multiply(IntegerType(0)), IntegerType(0)); + + ASSERT_EQ(Frac::multiply(IntegerType(1)), + IntegerType(double(numerator) / double(denominator))); + + ASSERT_EQ(Frac::multiply(IntegerType(100)), + IntegerType(double(numerator) / double(denominator) * double(100))); + + ASSERT_EQ(Frac::multiply(IntegerType(101)), + IntegerType(double(numerator) / double(denominator) * double(101))); + + // Test where naive algorithm causes overflow, when within precision of double + if /*constexpr*/ (sizeof(IntegerType) < sizeof(double)) { + + static constexpr IntegerType max = std::numeric_limits::max(); + static constexpr IntegerType val = (numerator > denominator) ? 
+ (max / numerator * denominator) : max; + + ASSERT_EQ(Frac::multiply(IntegerType(val)), + IntegerType(double(numerator) / double(denominator) * double(val))); + } + +} + +template +void testFractionMultiplyTypes() +{ + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); +} + + +#define RAJA_FRACTION_RUN_TEST(test) \ + test(); \ + test(); + +TEST(Fraction, basic_multiply_Fraction) +{ + RAJA_FRACTION_RUN_TEST(testFractionMultiplyTypes) +} From 2d490a54ed511dcd8f1563a07687da74dac7fdcc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 31 Mar 2024 14:01:16 -0700 Subject: [PATCH 018/108] Add occ_calc_fraction cuda/hip policies Template iteration_mapping types to allow a modifying fraction to be added that is used when calculating the max number of blocks to launch of kernels where the number of blocks is not specified. --- include/RAJA/policy/cuda/forall.hpp | 116 ++++++++++++++---- include/RAJA/policy/cuda/kernel/For.hpp | 14 +-- include/RAJA/policy/cuda/kernel/ForICount.hpp | 20 +-- include/RAJA/policy/cuda/kernel/Tile.hpp | 14 +-- .../RAJA/policy/cuda/kernel/TileTCount.hpp | 20 +-- include/RAJA/policy/cuda/kernel/internal.hpp | 36 +++--- include/RAJA/policy/cuda/launch.hpp | 48 ++++---- include/RAJA/policy/cuda/policy.hpp | 55 ++++++--- include/RAJA/policy/hip/forall.hpp | 116 ++++++++++++++---- include/RAJA/policy/hip/kernel/For.hpp | 14 +-- include/RAJA/policy/hip/kernel/ForICount.hpp | 20 +-- include/RAJA/policy/hip/kernel/Tile.hpp | 14 +-- include/RAJA/policy/hip/kernel/TileTCount.hpp | 20 +-- include/RAJA/policy/hip/kernel/internal.hpp | 36 +++--- include/RAJA/policy/hip/launch.hpp | 48 ++++---- include/RAJA/policy/hip/policy.hpp | 34 +++-- include/RAJA/util/types.hpp | 25 +++- test/include/RAJA_test-forall-execpol.hpp | 6 +- 18 files changed, 415 insertions(+), 241 deletions(-) diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 3837a8b062..c2ddd67505 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -55,6 +55,57 @@ namespace cuda namespace impl { +/*! + ****************************************************************************** + * + * \brief Cuda grid dimension helper for strided loops template. + * + * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid + * stride loop to run multiple iterates in a single thread. 
+ * + ****************************************************************************** + */ +template +struct GridStrideHelper; + +/// handle direct policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) + { + return normal_grid_size; + } +}; + +/// handle strided loop policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + return std::min(normal_grid_size, max_grid_size); + } +}; + +/// handle strided loop policies with multiplier on iterates per thread +template +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified, Fraction>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + // use inverse multiplier on max grid size to affect number of threads + using Frac = typename Fraction::inverse; + max_grid_size = Frac::multiply(max_grid_size); + return std::min(normal_grid_size, max_grid_size); + } +}; + /*! ****************************************************************************** * @@ -77,13 +128,14 @@ struct ForallDimensionCalculator; // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -101,12 +153,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -120,12 +173,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -138,11 +192,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, 
UniqueMarker> { + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > @@ -157,13 +213,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::cuda::IndexGlobal; @@ -176,12 +233,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::cuda::IndexGlobal; @@ -201,13 +259,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::cuda::IndexGlobal; template < typename IdxT > @@ -218,7 +277,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, static_cast(IndexMapper::block_size)); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), static_cast(max_grid_size)); @@ -227,11 +286,12 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::cuda::IndexGlobal; template < typename IdxT > @@ -241,7 +301,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::CudaOccupancyCalculator oc(func); auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), static_cast(max_sizes.second)); @@ -273,7 +333,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -298,7 +358,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > 
RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -324,7 +384,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -352,7 +412,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -379,7 +439,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -405,7 +466,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -433,7 +495,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -462,7 +525,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index 11870f13b0..90c26faca6 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -45,7 +45,7 @@ template , + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -108,7 +108,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct CudaStatementExecutor< statement::For, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index dd7c4c4ffe..be0d15feb3 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, 
EnclosedStmts...>, Types>; @@ -103,20 +103,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct CudaStatementExecutor< statement::ForICount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index ad54c86a54..615c9943c2 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -143,7 +143,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct CudaStatementExecutor< Data, statement::Tile, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index 84a0bec412..6b6b7b3197 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : public 
CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -131,14 +131,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct CudaStatementExecutor< Data, statement::TileTCount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index a33b564309..ae0e442cdf 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -436,7 +436,7 @@ struct 
KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 26e56e5cda..5dba388d06 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -538,7 +538,7 @@ struct LoopExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -736,18 +736,18 @@ struct LoopICountExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -890,7 +890,7 @@ struct LoopExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> 
{ diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 92c1f1c701..4a5875a769 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -883,53 +883,70 @@ using global_z = IndexGlobal; } // namespace cuda // policies usable with forall, scan, and sort + template using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; template using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; template using cuda_exec_grid = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, Async>; template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, true>; template using cuda_exec = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_calc_fraction_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_fraction_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_calc_fraction = policy::cuda::cuda_exec_explicit< + 
iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; // policies usable with WorkGroup template @@ -960,11 +977,11 @@ using policy::cuda::cuda_block_reduce; using policy::cuda::cuda_warp_reduce; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, cuda::thread_x>; @@ -990,31 +1007,31 @@ using cuda_launch_t = policy::cuda::cuda_launch_explicit_t using cuda_indexer_direct = policy::cuda::cuda_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_indexer_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; template < typename ... indexers > using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index b0b86131ef..2f9830bb31 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -56,6 +56,57 @@ namespace hip namespace impl { +/*! + ****************************************************************************** + * + * \brief Hip grid dimension helper for strided loops template. + * + * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid + * stride loop to run multiple iterates in a single thread. 
+ * + ****************************************************************************** + */ +template +struct GridStrideHelper; + +/// handle direct policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) + { + return normal_grid_size; + } +}; + +/// handle strided loop policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + return std::min(normal_grid_size, max_grid_size); + } +}; + +/// handle strided loop policies with multiplier on iterates per thread +template +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified, Fraction>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + // use inverse multiplier on max grid size to affect number of threads + using Frac = typename Fraction::inverse; + max_grid_size = Frac::multiply(max_grid_size); + return std::min(normal_grid_size, max_grid_size); + } +}; + /*! ****************************************************************************** * @@ -78,13 +129,14 @@ struct ForallDimensionCalculator; // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -102,12 +154,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -121,12 +174,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -139,11 +193,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, 
UniqueMarker> { + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > @@ -158,13 +214,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::hip::IndexGlobal; @@ -177,12 +234,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::hip::IndexGlobal; @@ -202,13 +260,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::hip::IndexGlobal; template < typename IdxT > @@ -219,7 +278,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, static_cast(IndexMapper::block_size)); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), static_cast(max_grid_size)); @@ -228,11 +287,12 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::hip::IndexGlobal; template < typename IdxT > @@ -242,7 +302,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::HipOccupancyCalculator oc(func); auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), static_cast(max_sizes.second)); @@ -273,7 +333,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -297,7 +357,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> 
__global__ @@ -322,7 +382,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -349,7 +409,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -375,7 +435,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -400,7 +461,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -427,7 +489,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -455,7 +518,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index ce8e87d869..10563bc20e 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -45,7 +45,7 @@ template , + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -108,7 +108,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct HipStatementExecutor< statement::For, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index 001cc28b77..be7e256274 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -103,20 +103,20 @@ template , + RAJA::policy::hip::hip_indexer, 
kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct HipStatementExecutor< statement::ForICount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 24f38b7647..51a199226f 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -143,7 +143,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct HipStatementExecutor< Data, statement::Tile, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index c92f92fb71..72e4114a23 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, 
EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -131,14 +131,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct HipStatementExecutor< Data, statement::TileTCount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 2c93520b93..1c520d4af9 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -436,7 +436,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, 
hip::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 2e54b16a81..8f605cb538 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute> { HIP generic loop implementations */ template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -538,7 +538,7 @@ struct LoopExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -736,18 +736,18 @@ struct LoopICountExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -890,7 +890,7 @@ struct LoopExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 75f9abd878..9c72cc8993 100644 --- 
a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -879,27 +879,35 @@ using global_z = IndexGlobal; // policies usable with forall, scan, and sort template using hip_exec_grid = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, Async>; template using hip_exec_grid_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, true>; template using hip_exec = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, Async>; + iteration_mapping::Direct<>, hip::global_x, Async>; template using hip_exec_async = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, true>; + iteration_mapping::Direct<>, hip::global_x, true>; template using hip_exec_occ_calc = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, true>; + +template +using hip_exec_occ_calc_fraction = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, Async>; + +template +using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, true>; // policies usable with WorkGroup using policy::hip::hip_work; @@ -923,11 +931,11 @@ using policy::hip::hip_block_reduce; using policy::hip::hip_warp_reduce; using hip_warp_direct = RAJA::policy::hip::hip_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, hip::thread_x>; @@ -947,31 +955,31 @@ using policy::hip::hip_launch_t; // policies usable with kernel and launch template < typename ... indexers > using hip_indexer_direct = policy::hip::hip_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using hip_indexer_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using hip_indexer_syncable_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; template < typename ... indexers > using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... 
indexers > using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 3c1aeaf042..95b139bce5 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -67,6 +67,18 @@ enum struct kernel_sync_requirement : int namespace iteration_mapping { +struct DirectBase {}; +struct LoopBase {}; +struct ContiguousLoopBase : LoopBase {}; +struct StridedLoopBase : LoopBase {}; +struct UnsizedLoopBase {}; +struct SizedLoopBase {}; +template < size_t t_max_iterations > +struct SizedLoopSpecifyingBase : SizedLoopBase +{ + static constexpr size_t max_iterations = t_max_iterations; +}; + /// /// Direct assumes the loop has enough iterations for all of the indices and /// maps directly from an iteration to an index. @@ -88,7 +100,8 @@ namespace iteration_mapping /// // 3 -> {3} /// // 4 -> {} /// -struct Direct {}; +template < typename ... Modifiers > +struct Direct : DirectBase {}; /// /// Contiguousloop assumes the loop has fewer iterations than indices and @@ -115,7 +128,10 @@ struct Direct {}; /// // 1 -> {3, 4, 5} /// // 2 -> {6, 7} /// -struct Contiguousloop {}; +template < size_t max_iterations, typename ... Modifiers > +struct Contiguousloop : ContiguousLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; /// /// StridedLoop assumes the loop has fewer iterations than indices and @@ -142,7 +158,10 @@ struct Contiguousloop {}; /// // 1 -> {1, 4, 7} /// // 2 -> {2, 5} /// -struct StridedLoop {}; +template < size_t max_iterations, typename ... Modifiers > +struct StridedLoop : StridedLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; } // namespace iteration_mapping diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 2fe790ff93..33cc17f7eb 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -108,7 +108,8 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols; using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, - RAJA::cuda_exec_explicit<256,2> >; + RAJA::cuda_exec_explicit<256,2>, + RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -119,7 +120,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, - RAJA::hip_exec_grid<256, 64> >; + RAJA::hip_exec_grid<256, 64>, + RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; using HipForallReduceExecPols = HipForallExecPols; From 95a5d07e120bbfdaf7def723f4ba0f4030a0abc6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 31 Mar 2024 14:34:26 -0700 Subject: [PATCH 019/108] Add cuda/hip_occ_calc_recommended policies These policies will represent the recommended way to use the occupancy calculator. 
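For orientation, a minimal usage sketch of the policy family this commit introduces, assuming the cuda_exec_occ_calc_recommended alias takes a block size and an optional Async flag like the other cuda_exec_* aliases (the alias is renamed in a later commit in this series). The kernel body, function name, and the 256 block size below are illustrative choices, not part of the patch:

#include "RAJA/RAJA.hpp"

// Illustrative only: sum N values with a reducer, letting the occupancy
// calculator choose the grid size for the strided-loop kernel.
void sum_example(const double* x, int N, double* result)
{
  // Alias added in this commit; signature assumed to match cuda_exec_occ_calc.
  using exec_pol = RAJA::cuda_exec_occ_calc_recommended<256>;

  RAJA::ReduceSum<RAJA::cuda_reduce, double> sum(0.0);

  RAJA::forall<exec_pol>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) {
    sum += x[i];
  });

  *result = sum.get();
}

The same usage carries over to the hip_exec_occ_calc_recommended alias on the HIP side; only the policy and reduce-policy names change.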
--- include/RAJA/policy/cuda/policy.hpp | 16 ++++++++++++++++ include/RAJA/policy/hip/policy.hpp | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 4a5875a769..90341b9095 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -948,6 +948,22 @@ template using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; +template +using cuda_exec_occ_calc_recommended_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_recommended_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_calc_recommended = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_recommended_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + // policies usable with WorkGroup template using cuda_work_explicit = policy::cuda::cuda_work_explicit; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 9c72cc8993..53ce01dc9f 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -909,6 +909,14 @@ template using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, true>; +template +using hip_exec_occ_calc_recommended = policy::hip::hip_exec< + iteration_mapping::StridedLoop>, hip::global_x, Async>; + +template +using hip_exec_occ_calc_recommended_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop>, hip::global_x, true>; + // policies usable with WorkGroup using policy::hip::hip_work; From 405446048d722b90fdea8c9072a3c2ad8ab4ed38 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 13:40:08 -0700 Subject: [PATCH 020/108] empty From ec5a68c55b16a1d769465877b8d62437136c9b4c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Apr 2024 11:16:37 -0700 Subject: [PATCH 021/108] Add some documentation --- docs/sphinx/user_guide/feature/policies.rst | 7 +++++++ include/RAJA/util/basic_mempool.hpp | 1 + 2 files changed, 8 insertions(+) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index e61be4e598..53ed56bbe1 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -271,6 +271,13 @@ policies have the prefix ``hip_``. default. Note this can improve reducer performance in kernels with large iteration counts. + cuda/hip_exec_occ_calc_recommended forall The same as + cuda/hip_exec_occ_calc + except the grid size upper bound + may be modified from the + maximum occupancy to improve performance. + Note this is the recommended + policy to use with reducers. 
cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp index 61624e0725..f0208ccbd3 100644 --- a/include/RAJA/util/basic_mempool.hpp +++ b/include/RAJA/util/basic_mempool.hpp @@ -309,6 +309,7 @@ class MemPool } + /// Free all backing allocations, even if they are currently in use void free_chunks() { #if defined(RAJA_ENABLE_OPENMP) From dd89cd4d82faa7d26c31fa24ebfcbfafd64b95a3 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Apr 2024 14:25:18 -0700 Subject: [PATCH 022/108] change type used to match recommended policy --- test/include/RAJA_test-forall-execpol.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 33cc17f7eb..458e6d06d0 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -109,7 +109,7 @@ using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, RAJA::cuda_exec_explicit<256,2>, - RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -121,7 +121,7 @@ using CudaForallAtomicExecPols = CudaForallExecPols; using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, RAJA::hip_exec_grid<256, 64>, - RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; using HipForallReduceExecPols = HipForallExecPols; From 014aebac5b16c654dfb236b78668217a2dfed390 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Apr 2024 14:26:01 -0700 Subject: [PATCH 023/108] rename recommended policy cuda/hip_exec_occ_calc_recommended changed to cuda/hip_exec_rec_for_reduce --- docs/sphinx/user_guide/feature/policies.rst | 10 +++------- include/RAJA/policy/cuda/policy.hpp | 12 ++++-------- include/RAJA/policy/hip/policy.hpp | 6 ++---- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 53ed56bbe1..d3f982951a 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -271,13 +271,9 @@ policies have the prefix ``hip_``. default. Note this can improve reducer performance in kernels with large iteration counts. - cuda/hip_exec_occ_calc_recommended forall The same as - cuda/hip_exec_occ_calc - except the grid size upper bound - may be modified from the - maximum occupancy to improve performance. - Note this is the recommended - policy to use with reducers. + cuda/hip_exec_rec_for_reduce forall The cuda/hip exec policy + that is recommended for + use with reducers. 
cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 90341b9095..8e98deeaf2 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -949,20 +949,16 @@ using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_occ_calc_recommended_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; +using cuda_exec_rec_for_reduce_explicit = cuda_exec_occ_calc_explicit; template -using cuda_exec_occ_calc_recommended_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; +using cuda_exec_rec_for_reduce_explicit_async = cuda_exec_occ_calc_explicit_async; template -using cuda_exec_occ_calc_recommended = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; +using cuda_exec_rec_for_reduce = cuda_exec_occ_calc; template -using cuda_exec_occ_calc_recommended_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; +using cuda_exec_rec_for_reduce_async = cuda_exec_occ_calc_async; // policies usable with WorkGroup template diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 53ce01dc9f..49cd489be4 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -910,12 +910,10 @@ using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, true>; template -using hip_exec_occ_calc_recommended = policy::hip::hip_exec< - iteration_mapping::StridedLoop>, hip::global_x, Async>; +using hip_exec_rec_for_reduce = hip_exec_occ_calc_fraction, Async>; template -using hip_exec_occ_calc_recommended_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop>, hip::global_x, true>; +using hip_exec_rec_for_reduce_async = hip_exec_occ_calc_fraction_async>; // policies usable with WorkGroup using policy::hip::hip_work; From 9cbba9148dfe0bff03a47bd131d16532e80a7404 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 12:10:29 -0700 Subject: [PATCH 024/108] Add Concretizer Remove modifiers from loop iteration mappings and move the occupancy calculator modifications into Concretizer classes that are used when block size or grid size is not specified in the ForallDimensionCalculator. 
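To make the Concretizer split concrete, here is a standalone sketch, under stated assumptions, of the pattern this commit introduces: a Concretizer turns per-kernel occupancy data into an upper bound on grid size, and the dimension calculator clamps the grid size needed to cover the iteration space against that bound. The field names mirror those added in the diff, but the OccupancyData aggregate and the pick_grid_size helper are illustrative names only, not the patch's actual classes:

#include <algorithm>
#include <cstddef>

// Cached occupancy data for one kernel on the current device (illustrative).
struct OccupancyData {
  std::size_t device_sm_per_device;    // number of SMs on the device
  std::size_t func_max_blocks_per_sm;  // occupancy-calculator result for the kernel
};

// Sketch of a max-occupancy concretizer: fill every SM at the kernel's
// maximum occupancy (compare MaxOccupancyConcretizer in the diff).
struct MaxOccupancyConcretizerSketch {
  template <typename IdxT>
  static IdxT get_max_grid_size(OccupancyData const& data)
  {
    return static_cast<IdxT>(data.func_max_blocks_per_sm * data.device_sm_per_device);
  }
};

// How a dimension calculator might use a Concretizer when only the block size
// is fixed: take the smaller of "blocks needed for len" and the Concretizer's
// device-fitting bound (compare get_grid_size_to_fit_device in the diff).
template <typename Concretizer, typename IdxT>
IdxT pick_grid_size(OccupancyData const& data, IdxT len, IdxT block_size)
{
  IdxT blocks_for_len = (len + block_size - 1) / block_size;  // divide, rounding up
  IdxT max_blocks     = Concretizer::template get_max_grid_size<IdxT>(data);
  return std::min(blocks_for_len, max_blocks);
}

Swapping in a fraction-and-offset concretizer only changes how max_blocks is derived from the occupancy data; the clamping logic in the dimension calculator stays the same, which is the point of factoring the modifiers out of the iteration-mapping types.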
--- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 253 +++++++++-------- include/RAJA/policy/cuda/forall.hpp | 217 ++++++-------- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 27 +- include/RAJA/policy/cuda/kernel/For.hpp | 4 +- include/RAJA/policy/cuda/kernel/ForICount.hpp | 6 +- include/RAJA/policy/cuda/kernel/Tile.hpp | 4 +- .../RAJA/policy/cuda/kernel/TileTCount.hpp | 6 +- include/RAJA/policy/cuda/kernel/internal.hpp | 18 +- include/RAJA/policy/cuda/launch.hpp | 24 +- include/RAJA/policy/cuda/policy.hpp | 205 ++++++++++++-- include/RAJA/policy/cuda/scan.hpp | 12 +- include/RAJA/policy/cuda/sort.hpp | 66 +++-- include/RAJA/policy/hip/MemUtils_HIP.hpp | 264 ++++++++++-------- include/RAJA/policy/hip/forall.hpp | 217 ++++++-------- include/RAJA/policy/hip/kernel.hpp | 2 +- include/RAJA/policy/hip/kernel/For.hpp | 4 +- include/RAJA/policy/hip/kernel/ForICount.hpp | 6 +- include/RAJA/policy/hip/kernel/HipKernel.hpp | 27 +- include/RAJA/policy/hip/kernel/Tile.hpp | 4 +- include/RAJA/policy/hip/kernel/TileTCount.hpp | 6 +- include/RAJA/policy/hip/kernel/internal.hpp | 18 +- include/RAJA/policy/hip/launch.hpp | 24 +- include/RAJA/policy/hip/policy.hpp | 154 ++++++++-- include/RAJA/policy/hip/scan.hpp | 12 +- include/RAJA/policy/hip/sort.hpp | 66 +++-- include/RAJA/util/resource.hpp | 20 +- include/RAJA/util/types.hpp | 8 +- test/include/RAJA_test-forall-execpol.hpp | 6 +- 28 files changed, 986 insertions(+), 694 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 2a8f848825..7eee19dacf 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -299,184 +299,207 @@ cudaDeviceProp& device_prop() struct CudaFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device; + int device_max_threads_per_sm; }; RAJA_INLINE -size_t cuda_max_blocks(size_t block_size) +CudaFixedMaxBlocksData cuda_max_blocks() { - static CudaFixedMaxBlocksData data = []() { - cudaDeviceProp& prop = cuda::device_prop(); - return CudaFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); + static thread_local CudaFixedMaxBlocksData data { + cuda::device_prop().multiProcessorCount, + cuda::device_prop().maxThreadsPerMultiProcessor }; - size_t max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); - - return max_blocks; + return data; } struct CudaOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block; + int func_max_blocks_per_device; + int func_max_threads_per_block; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local CudaOccMaxBlocksThreadsData data { + std::numeric_limits::max(), + -1, + -1 }; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { cudaErrchk(cudaOccupancyMaxPotentialBlockSize( - 
&data.max_blocks, &data.max_threads, func, shmem_size)); + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); - data.prev_shmem_size = shmem_size; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct CudaOccMaxBlocksFixedThreadsData +struct CudaOccMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block; + int func_threads_per_block; + int device_sm_per_device; + int device_max_threads_per_sm; + int func_max_blocks_per_sm; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; - - if (data.prev_shmem_size != shmem_size) { + static thread_local CudaOccMaxBlocksData data { + std::numeric_limits::max(), + func_threads_per_block, + cuda::device_prop().multiProcessorCount, + cuda::device_prop().maxThreadsPerMultiProcessor, + -1 }; - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); - - if (data.multiProcessorCount == uninitialized) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - data.multiProcessorCount = cuda::device_prop().multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; - - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { - - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + static thread_local CudaOccMaxBlocksData data { + std::numeric_limits::max(), + -1, + cuda::device_prop().multiProcessorCount, + cuda::device_prop().maxThreadsPerMultiProcessor, + -1 }; - if (data.multiProcessorCount == uninitialized) { + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { 
- data.multiProcessorCount = cuda::device_prop().multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccupancyDefaults +/*! + ****************************************************************************** + * + * \brief Cuda Concretizer Implementation. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class the determines the max number of blocks to use when + * fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - CudaOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + // Get the maximum block size + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = cuda_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + // Get a block size that combined with the given grid size is large enough + // to do len work, or 0 if not possible + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::cuda::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct CudaOccupancyCalculator -{ - CudaOccupancyCalculator(const void* func) - : m_func(func) - { } + // Get a grid size that combined with the given block size is large enough + // to do len work + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + // Get a block size and grid size that combined is large enough + // to do len work + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + // Get a block size that combined with the given grid size is the smaller of + // the amount need to achieve maximum 
occupancy on the device or + // the amount needed to do len work + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + // Get a grid size that combined with the given block size is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = cuda_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + // Get a block size and grid size that combined is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace cuda diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index c2ddd67505..333f0f90e8 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -55,57 +55,6 @@ namespace cuda namespace impl { -/*! - ****************************************************************************** - * - * \brief Cuda grid dimension helper for strided loops template. - * - * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid - * stride loop to run multiple iterates in a single thread. 
- * - ****************************************************************************** - */ -template -struct GridStrideHelper; - -/// handle direct policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) - { - return normal_grid_size; - } -}; - -/// handle strided loop policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - return std::min(normal_grid_size, max_grid_size); - } -}; - -/// handle strided loop policies with multiplier on iterates per thread -template -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified, Fraction>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - // use inverse multiplier on max grid size to affect number of threads - using Frac = typename Fraction::inverse; - max_grid_size = Frac::multiply(max_grid_size); - return std::min(normal_grid_size, max_grid_size); - } -}; - /*! ****************************************************************************** * @@ -121,21 +70,21 @@ struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -143,8 +92,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -153,160 +104,168 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t 
RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_cuda_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_cuda_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_cuda_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); + + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not 
supported in this configuration"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_cuda_dim(dims.threads, calculated_block_size); - internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct 
ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; @@ -558,7 +517,7 @@ void forallp_cuda_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -566,7 +525,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicitconst&, + ::RAJA::policy::cuda::cuda_exec_explicitconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -574,9 +533,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -627,7 +586,7 @@ forall_impl(resources::Cuda cuda_res, template RAJA_INLINE concepts::enable_if_t< @@ -635,7 +594,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit const&, + ::RAJA::policy::cuda::cuda_exec_explicit const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -643,9 +602,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list, LOOP_BODY, Iterator, ForallParam>; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -723,11 +682,11 @@ forall_impl(resources::Cuda cuda_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -736,7 +695,7 @@ forall_impl(resources::Cuda r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::cuda::cuda_exec_explicit(), + ::RAJA::policy::cuda::cuda_exec_explicit(), loop_body); } // iterate over segments of index set diff 
--git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 6497a64f42..c070d618ea 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct CudaKernelExt - : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { }; @@ -284,7 +284,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -294,8 +294,10 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -305,8 +307,9 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -360,7 +363,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -373,16 +376,18 @@ struct CudaLaunchHelper( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::cuda::cuda_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index 90c26faca6..9de20c7b4b 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -45,7 +45,7 @@ template , sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>>; + RAJA::policy::cuda::cuda_indexer>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index be0d15feb3..8486abaa2c 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> : CudaStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index 615c9943c2..ad901f6b02 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct CudaStatementExecutor< 
using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; + using DimensionCalculator = KernelDimensionCalculator>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index 6b6b7b3197..c611346d46 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index ae0e442cdf..9c904ea45a 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 5dba388d06..602221e58a 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute -struct LoopExecute, +struct LoopExecute, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct LoopICountExecute, +struct LoopICountExecute, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute -struct LoopExecute, +struct LoopExecute, SEGMENT> - : LoopExecute, + : LoopExecute, SEGMENT> {}; template -struct LoopExecute, +struct LoopExecute, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct TileExecute, +struct TileExecute, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct 
TileTCountExecute, +struct TileTCountExecute, SEGMENT> { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 8e98deeaf2..f8ec6773c8 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -22,6 +22,7 @@ #if defined(RAJA_CUDA_ACTIVE) +#include #include #include "RAJA/pattern/reduce.hpp" @@ -78,6 +79,86 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + } // namespace cuda namespace policy @@ -100,7 +181,8 @@ struct cuda_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>; }; -template +template struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::forall, @@ -108,9 +190,11 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::cuda> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; -template +template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::region, @@ -119,8 +203,6 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform }; - - // // NOTE: There is no Index set segment iteration policy for CUDA // @@ -882,83 +964,144 @@ using global_z = IndexGlobal; } // namespace cuda +// contretizers used in forall, scan, and sort policies + +using CudaDefaultAvoidMaxOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer, -1>; + +template < typename AvoidMaxOccupancyConcretizer > +using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer; + +using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; + +using CudaRecForReduceConcretizer = cuda::MaxOccupancyConcretizer; + +using CudaDefaultConcretizer = cuda::MaxOccupancyConcretizer; + // policies usable with forall, scan, and sort template using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_grid = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_explicit = 
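A standalone numeric sketch of the grid-size formula documented for FractionOffsetOccupancyConcretizer above, (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm, where each adjustment is applied only while it keeps the block count positive. The struct and function names below are illustrative stand-ins rather than RAJA types; the 1/2 fraction is assumed for illustration, only the -1 offset is taken from the default alias above.

// Standalone sketch (not RAJA code) of the fraction/offset grid-size formula:
//   max_grid = (Fraction * func_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
// Each adjustment is applied only when it leaves a positive block count, which
// is the "care to avoid zero" mentioned in the doc comment above.
#include <cstddef>
#include <cstdio>

struct OccupancyData {        // illustrative stand-in for the cached occupancy data
  int device_sm_per_device;
  int func_max_blocks_per_sm;
};

template <int Num, int Den, std::ptrdiff_t BlocksPerSmOffset>
int fraction_offset_max_grid_size(OccupancyData const& data)
{
  int blocks_per_sm = data.func_max_blocks_per_sm;

  // apply the fraction, but never drop to zero
  const int scaled = (blocks_per_sm * Num) / Den;
  if (scaled > 0) { blocks_per_sm = scaled; }

  // apply the offset, but never drop to zero
  const std::ptrdiff_t offset = blocks_per_sm + BlocksPerSmOffset;
  if (offset > 0) { blocks_per_sm = static_cast<int>(offset); }

  return blocks_per_sm * data.device_sm_per_device;
}

int main()
{
  // A kernel that fits 4 blocks per SM on an 80-SM device, scaled by 1/2 with
  // a -1 block offset: (4 * 1/2 - 1) * 80 = 80 blocks.
  OccupancyData data{80, 4};
  std::printf("%d\n", fraction_offset_max_grid_size<1, 2, -1>(data));
  return 0;
}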
policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_occ_calc_fraction_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; +using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_occ_calc_fraction_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; +using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_occ_calc_fraction = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; +using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; +using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using 
cuda_exec_occ_avoid_max_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_avoid_max_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_avoid_max = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_avoid_max_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce_explicit = cuda_exec_occ_calc_explicit; +using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_explicit_async = cuda_exec_occ_calc_explicit_async; +using cuda_exec_rec_for_reduce_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce = cuda_exec_occ_calc; +using cuda_exec_rec_for_reduce = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_async = cuda_exec_occ_calc_async; +using cuda_exec_rec_for_reduce_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + // policies usable with WorkGroup template @@ -989,7 +1132,7 @@ using policy::cuda::cuda_block_reduce; using policy::cuda::cuda_warp_reduce; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< @@ -1019,7 +1162,7 @@ using cuda_launch_t = policy::cuda::cuda_launch_explicit_t using cuda_indexer_direct = policy::cuda::cuda_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; @@ -1037,7 +1180,7 @@ using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< template < typename ... 
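For callers, the refactored aliases above remain drop-in forall policies. A hedged usage sketch, assuming a CUDA build of RAJA and device-accessible arrays x and y:

// Hedged usage sketch: the policy aliases above are ordinary template
// arguments to RAJA::forall; only the internals of how launch dimensions are
// concretized changed. Assumes a CUDA build of RAJA.
#include "RAJA/RAJA.hpp"

void daxpy(double* y, const double* x, double a, int n)
{
  // direct mapping: one iterate per thread, grid sized to cover n
  RAJA::forall<RAJA::cuda_exec<256>>(
      RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) { y[i] = a * x[i] + y[i]; });

  // occupancy-capped grid-stride loop over the same range
  RAJA::forall<RAJA::cuda_exec_occ_calc<256>>(
      RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) { y[i] = a * x[i] + y[i]; });
}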
indexers > using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index 5d89844e3c..0a9b0bf305 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -44,6 +44,7 @@ namespace scan */ template inclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -96,6 +97,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -152,6 +154,7 @@ exclusive_inplace( */ template inclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -206,6 +209,7 @@ inclusive( */ template exclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index 6e6e4c5696..c5a353b704 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -44,7 +44,9 @@ namespace sort /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -54,7 +56,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -75,13 +77,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::less>) @@ -143,13 +147,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -212,7 +218,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -222,7 +230,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -243,13 +251,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -260,13 +270,15 @@ unstable( /*! 
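The Concretizer parameter threaded through the scan overloads above is internal plumbing; the user-facing scan interface is unchanged. A hedged sketch, assuming the span-based RAJA scan API and a device-accessible array d_vals of length n:

// Hedged usage sketch: user-facing scans look the same before and after the
// concretizer refactor. Assumes the span-based RAJA scan overloads and a
// device-accessible array d_vals.
#include "RAJA/RAJA.hpp"

void prefix_sums(double* d_vals, int n)
{
  // in-place inclusive scan, summing with RAJA's plus operator
  RAJA::inclusive_scan_inplace<RAJA::cuda_exec<256>>(
      RAJA::make_span(d_vals, n), RAJA::operators::plus<double>{});

  // in-place exclusive scan over the same span
  RAJA::exclusive_scan_inplace<RAJA::cuda_exec<256>>(
      RAJA::make_span(d_vals, n), RAJA::operators::plus<double>{});
}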
\brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -278,7 +290,8 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -314,7 +327,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -322,7 +336,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -396,7 +410,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -404,7 +419,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -479,7 +494,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -515,7 +531,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -523,7 +540,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -535,7 +552,8 @@ unstable_pairs( /*! 
\brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -543,7 +561,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index e45d3a6aff..9b8442637b 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -301,203 +301,227 @@ hipDeviceProp_t& device_prop() struct HipFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device; + int device_max_threads_per_sm; }; RAJA_INLINE -int hip_max_blocks(int block_size) +HipFixedMaxBlocksData hip_max_blocks() { - static HipFixedMaxBlocksData data = []() { - hipDeviceProp_t& prop = hip::device_prop(); - return HipFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); - - int max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); + static thread_local HipFixedMaxBlocksData data { + hip::device_prop().multiProcessorCount, + hip::device_prop().maxThreadsPerMultiProcessor }; - return max_blocks; + return data; } struct HipOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block; + int func_max_blocks_per_device; + int func_max_threads_per_block; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksThreadsData data { + std::numeric_limits::max(), + -1, + -1 }; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxPotentialBlockSize( - &data.max_blocks, &data.max_threads, func, shmem_size)); + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); hipDeviceProp_t& prop = hip::device_prop(); - data.max_blocks = prop.multiProcessorCount; - data.max_threads = 1024; + data.func_max_blocks_per_device = prop.multiProcessorCount; + data.func_max_threads_per_block = 1024; #endif - data.prev_shmem_size = shmem_size; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct HipOccMaxBlocksFixedThreadsData +struct HipOccMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block; + int func_threads_per_block; + int device_sm_per_device; + int device_max_threads_per_sm; + int func_max_blocks_per_sm; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +template < typename 
RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data { + std::numeric_limits::max(), + func_threads_per_block, + hip::device_prop().multiProcessorCount, + hip::device_prop().maxThreadsPerMultiProcessor, + -1 }; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; } - max_blocks = data.max_blocks; - + return data; } -struct HipOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data { + std::numeric_limits::max(), + -1, + hip::device_prop().multiProcessorCount, + hip::device_prop().maxThreadsPerMultiProcessor, + -1 }; + + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 
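The occupancy helpers above share one memoization pattern: the UniqueMarker template parameter gives every kernel its own static thread_local cache, and the driver query reruns only when the dynamic shared-memory size (or block size) changes. A minimal standalone sketch of that pattern, with illustrative names and a stubbed-out driver query:

// Standalone sketch of the per-kernel occupancy caching used above.
#include <cstddef>
#include <limits>

// stand-in for the driver call (e.g. hipOccupancyMaxActiveBlocksPerMultiprocessor)
static int query_blocks_per_sm(const void* /*func*/, int /*threads_per_block*/,
                               std::size_t /*dynamic_shmem*/)
{
  return 4;  // fixed value keeps the sketch self-contained
}

struct CachedOccupancy {
  std::size_t dynamic_shmem = std::numeric_limits<std::size_t>::max();  // "uninitialized"
  int blocks_per_sm = -1;
};

// UniqueKernelMarker is never used directly; being a template parameter is
// enough to instantiate a distinct static thread_local cache per kernel.
template <typename UniqueKernelMarker>
CachedOccupancy cached_occupancy(const void* func, int threads_per_block,
                                 std::size_t dynamic_shmem)
{
  static thread_local CachedOccupancy cache;
  if (cache.dynamic_shmem != dynamic_shmem) {   // refresh only when inputs change
    cache.blocks_per_sm = query_blocks_per_sm(func, threads_per_block, dynamic_shmem);
    cache.dynamic_shmem = dynamic_shmem;
  }
  return cache;
}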
0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; - } - max_blocks = data.max_blocks; - + return data; } -struct HipOccupancyDefaults +/*! + ****************************************************************************** + * + * \brief Hip Concretizer Implementation. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class the determines the max number of blocks to use when + * fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - HipOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + // Get the maximum block size + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = hip_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + // Get a block size that combined with the given grid size is large enough + // to do len work, or 0 if not possible + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::hip::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct HipOccupancyCalculator -{ - HipOccupancyCalculator(const void* func) - : m_func(func) - { } + // Get a grid size that combined with the given block size is large enough + // to do len work + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + // Get a block size and grid size that combined is large enough + // to do len work + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + // Get a block size that combined with the given grid size is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = 
this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + // Get a grid size that combined with the given block size is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = hip_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + // Get a block size and grid size that combined is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace hip diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 2f9830bb31..6fa21f9217 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -56,57 +56,6 @@ namespace hip namespace impl { -/*! - ****************************************************************************** - * - * \brief Hip grid dimension helper for strided loops template. - * - * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid - * stride loop to run multiple iterates in a single thread. 
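A standalone numeric sketch of the two grid-sizing modes ConcretizerImpl exposes above: the fit-len queries launch exactly enough blocks to cover the range, while the fit-device queries cap the grid at the occupancy-derived limit so longer ranges fall back to a grid-stride loop. Function names and the example numbers are illustrative:

// Standalone sketch of "fit len" vs "fit device" grid sizing.
#include <algorithm>
#include <cstdio>

constexpr int divide_ceiling(int n, int d) { return (n + d - 1) / d; }

// enough blocks to cover len iterates at the given block size
int grid_to_fit_len(int len, int block_size)
{
  return divide_ceiling(len, block_size);
}

// the smaller of "enough blocks to cover len" and the occupancy-derived cap
int grid_to_fit_device(int len, int block_size, int func_max_blocks_per_device)
{
  return std::min(grid_to_fit_len(len, block_size), func_max_blocks_per_device);
}

int main()
{
  // 1M iterates, 256-thread blocks, kernel capped at 2 blocks/SM on 80 SMs
  std::printf("%d\n", grid_to_fit_len(1 << 20, 256));            // 4096
  std::printf("%d\n", grid_to_fit_device(1 << 20, 256, 2 * 80)); // 160
  return 0;
}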
- * - ****************************************************************************** - */ -template -struct GridStrideHelper; - -/// handle direct policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) - { - return normal_grid_size; - } -}; - -/// handle strided loop policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - return std::min(normal_grid_size, max_grid_size); - } -}; - -/// handle strided loop policies with multiplier on iterates per thread -template -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified, Fraction>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - // use inverse multiplier on max grid size to affect number of threads - using Frac = typename Fraction::inverse; - max_grid_size = Frac::multiply(max_grid_size); - return std::min(normal_grid_size, max_grid_size); - } -}; - /*! ****************************************************************************** * @@ -122,21 +71,21 @@ struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -144,8 +93,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -154,160 +105,168 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t 
RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_hip_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_hip_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_hip_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); + + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this 
configuration"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_hip_dim(dims.threads, calculated_block_size); - internal::set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, 
+template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; @@ -551,7 +510,7 @@ void forallp_hip_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -559,7 +518,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_execconst&, + ::RAJA::policy::hip::hip_execconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -567,9 +526,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -620,7 +579,7 @@ forall_impl(resources::Hip hip_res, template RAJA_INLINE concepts::enable_if_t< @@ -628,7 +587,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec const&, + ::RAJA::policy::hip::hip_exec const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -636,9 +595,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -716,11 +675,11 @@ forall_impl(resources::Hip hip_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -729,7 +688,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::hip::hip_exec(), + ::RAJA::policy::hip::hip_exec(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/kernel.hpp b/include/RAJA/policy/hip/kernel.hpp index 678d48e3c1..4f907f5f5f 100644 --- a/include/RAJA/policy/hip/kernel.hpp +++ b/include/RAJA/policy/hip/kernel.hpp @@ -4,7 +4,7 @@ * \file * * \brief RAJA 
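A condensed standalone sketch of how the ForallDimensionCalculator specializations above choose launch dimensions. The occupancy cap is taken as a given input here, whereas the real code obtains it from the Concretizer, and all names are illustrative:

// Standalone sketch of the dimension selection per iteration mapping.
#include <algorithm>
#include <utility>

constexpr int div_ceil(int n, int d) { return (n + d - 1) / d; }

// Direct with a fixed block size: one thread per iterate, grid sized to cover len.
std::pair<int, int> direct_fixed_block(int len, int block_size)
{
  return {block_size, div_ceil(len, block_size)};
}

// StridedLoop with a fixed block size: cover len if possible, but never exceed
// the occupancy-derived grid cap; the kernel's grid-stride loop picks up the rest.
std::pair<int, int> strided_fixed_block(int len, int block_size, int occupancy_grid_cap)
{
  return {block_size, std::min(div_ceil(len, block_size), occupancy_grid_cap)};
}

// StridedLoop with block and grid both fixed at compile time: the dimensions
// are used as-is and len does not affect the launch shape.
std::pair<int, int> strided_fixed_both(int block_size, int grid_size)
{
  return {block_size, grid_size};
}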
header file containing constructs used to run kernel::forall - * traversals on GPU with CUDA. + * traversals on GPU with HIP. * ****************************************************************************** */ diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index 10563bc20e..848ea42edf 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -45,7 +45,7 @@ template , sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer, sync, IndexMapper>>; + RAJA::policy::hip::hip_indexer>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index be7e256274..014b4db3ac 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> : HipStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 67bea1299a..68156600b2 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { }; @@ -263,7 +263,7 @@ struct HipLaunchHelper,Stmt inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -273,8 +273,10 @@ struct HipLaunchHelper,Stmt // determine blocks at runtime // determine threads at runtime // - ::RAJA::hip::hip_occupancy_max_blocks_threads( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -284,8 +286,9 @@ struct HipLaunchHelper,Stmt // recommended_threads = num_threads; - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -339,7 +342,7 @@ struct HipLaunchHelper,Stmt inline static void max_blocks(size_t shmem_size, int &max_blocks, int actual_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -352,16 +355,18 @@ struct HipLaunchHelper,Stmt // // determine blocks when actual_threads != num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = 
data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 51a199226f..62dda7f20d 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; + using DimensionCalculator = KernelDimensionCalculator>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index 72e4114a23..07637fbd8f 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 1c520d4af9..aa0610d736 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 8f605cb538..76f592d20b 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute> { HIP generic loop implementations */ 
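Because the occupancy helpers now return per-SM block counts together with device properties, the launch helper above performs the per-device conversion itself. A minimal sketch of that conversion, with illustrative names:

// Standalone sketch: per-SM occupancy to a device-wide block count.
struct OccBlocksData {          // illustrative stand-in for the helper's cached data
  int func_max_blocks_per_sm;   // per-SM occupancy reported for the kernel
  int device_sm_per_device;     // multiprocessor count of the device
};

inline int max_blocks_per_device(OccBlocksData const& data)
{
  return data.func_max_blocks_per_sm * data.device_sm_per_device;
}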
template -struct LoopExecute, +struct LoopExecute, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct LoopICountExecute, +struct LoopICountExecute, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute -struct LoopExecute, +struct LoopExecute, SEGMENT> - : LoopExecute, + : LoopExecute, SEGMENT> {}; template -struct LoopExecute, +struct LoopExecute, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct TileExecute, +struct TileExecute, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct TileTCountExecute, +struct TileTCountExecute, SEGMENT> { diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 49cd489be4..3ff0dd553f 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -74,6 +74,86 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + } // namespace hip namespace policy @@ -93,7 +173,8 @@ struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t< using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>; }; -template +template struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, @@ -101,6 +182,7 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::hip> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; template @@ -816,6 +898,7 @@ struct IndexFlatten }; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > struct get_index_thread; @@ -876,44 +959,83 @@ using global_z = IndexGlobal; } // namespace hip +// contretizers used in forall, scan, and sort policies + +using HipDefaultAvoidMaxOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer, -1>; + +template < typename AvoidMaxOccupancyConcretizer > +using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer; + +using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; + +using HipRecForReduceConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; + +using HipDefaultConcretizer = hip::MaxOccupancyConcretizer; + // policies usable with forall, scan, and sort + template using hip_exec_grid = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_grid_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec = policy::hip::hip_exec< - iteration_mapping::Direct<>, hip::global_x, Async>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_async = policy::hip::hip_exec< - iteration_mapping::Direct<>, hip::global_x, true>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec_occ_calc = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, true>; template -using hip_exec_occ_calc_fraction = policy::hip::hip_exec< - iteration_mapping::StridedLoop, 
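Unlike the CUDA hunk earlier, the HIP reduce-recommended concretizer above routes through AvoidDeviceMaxThreadOccupancyConcretizer. A standalone sketch of the decision that concretizer documents: use full kernel occupancy when the kernel cannot saturate the SM's thread capacity anyway, otherwise fall back to the reduced-occupancy concretizer. Names are illustrative:

// Standalone sketch of the avoid-device-max-thread-occupancy decision.
inline int avoid_device_max_grid(int device_max_threads_per_sm,
                                 int device_sm_per_device,
                                 int func_max_blocks_per_sm,
                                 int func_threads_per_block,
                                 int fallback_blocks_per_sm)
{
  const int func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
  const int blocks_per_sm = (func_max_threads_per_sm < device_max_threads_per_sm)
                                ? func_max_blocks_per_sm   // kernel cannot saturate the SM
                                : fallback_blocks_per_sm;  // back off from full occupancy
  return blocks_per_sm * device_sm_per_device;
}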
hip::global_x, Async>; +using hip_exec_occ_fraction = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, Async>; template -using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; +using hip_exec_occ_fraction_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, true>; + +template +using hip_exec_occ_avoid_max = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; + +template +using hip_exec_occ_avoid_max_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; template -using hip_exec_rec_for_reduce = hip_exec_occ_calc_fraction, Async>; +using hip_exec_rec_for_reduce = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipRecForReduceConcretizer, Async>; template -using hip_exec_rec_for_reduce_async = hip_exec_occ_calc_fraction_async>; +using hip_exec_rec_for_reduce_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipRecForReduceConcretizer, true>; // policies usable with WorkGroup using policy::hip::hip_work; @@ -937,7 +1059,7 @@ using policy::hip::hip_block_reduce; using policy::hip::hip_warp_reduce; using hip_warp_direct = RAJA::policy::hip::hip_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< @@ -961,7 +1083,7 @@ using policy::hip::hip_launch_t; // policies usable with kernel and launch template < typename ... indexers > using hip_indexer_direct = policy::hip::hip_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; @@ -979,7 +1101,7 @@ using hip_indexer_syncable_loop = policy::hip::hip_indexer< template < typename ... indexers > using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 40e44c2e19..cdf0a9b82d 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -49,6 +49,7 @@ namespace scan */ template @@ -56,7 +57,7 @@ RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op) @@ -121,6 +122,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op, @@ -198,6 +200,7 @@ exclusive_inplace( */ template inclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, @@ -271,6 +274,7 @@ inclusive( */ template exclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index a6918968c8..eb16246623 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,9 @@ namespace detail /*! 
\brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +85,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -102,13 +104,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::less>) @@ -190,13 +194,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::greater>) @@ -279,7 +285,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +297,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -308,13 +316,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +335,15 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +355,8 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -379,7 +392,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +401,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +497,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +506,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +603,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -624,7 +640,8 @@ unstable_pairs( /*! 
\brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +649,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +661,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +670,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index a54ce434a2..28a476d951 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -65,8 +65,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ + template + struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ using type = camp::resources::Cuda; }; @@ -75,8 +76,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Cuda; }; #endif @@ -87,8 +89,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource<::RAJA::policy::hip::hip_exec>{ + template + struct get_resource<::RAJA::policy::hip::hip_exec>{ using type = camp::resources::Hip; }; @@ -97,8 +100,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Hip; }; #endif diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 95b139bce5..011082953d 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -100,7 +100,6 @@ struct SizedLoopSpecifyingBase : SizedLoopBase /// // 3 -> {3} /// // 4 -> {} /// -template < typename ... Modifiers > struct Direct : DirectBase {}; /// @@ -128,7 +127,7 @@ struct Direct : DirectBase {}; /// // 1 -> {3, 4, 5} /// // 2 -> {6, 7} /// -template < size_t max_iterations, typename ... Modifiers > +template < size_t max_iterations > struct Contiguousloop : ContiguousLoopBase, std::conditional_t<(max_iterations != named_usage::unspecified), SizedLoopSpecifyingBase, UnsizedLoopBase> {}; @@ -158,7 +157,7 @@ struct Contiguousloop : ContiguousLoopBase, /// // 1 -> {1, 4, 7} /// // 2 -> {2, 5} /// -template < size_t max_iterations, typename ... 
Modifiers > +template < size_t max_iterations > struct StridedLoop : StridedLoopBase, std::conditional_t<(max_iterations != named_usage::unspecified), SizedLoopSpecifyingBase, UnsizedLoopBase> {}; @@ -201,6 +200,9 @@ struct Fraction using inverse = Fraction; + template < typename new_int_t > + using rebind = Fraction; + static constexpr int_t multiply(int_t val) noexcept { return (val / denominator) * numerator + diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 458e6d06d0..f09ff71182 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -109,7 +109,8 @@ using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, RAJA::cuda_exec_explicit<256,2>, - RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::cuda_exec_occ_avoid_max<256> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -121,7 +122,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols; using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, RAJA::hip_exec_grid<256, 64>, - RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::hip_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::hip_exec_occ_avoid_max<256> >; using HipForallReduceExecPols = HipForallExecPols; From f0bdae976c32aebc6ac7fae3fd54e2032cb02a5f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 14:09:34 -0700 Subject: [PATCH 025/108] Simplify cuda_exec_occ_avoid_max --- include/RAJA/policy/cuda/policy.hpp | 25 +++++++++++-------------- include/RAJA/policy/hip/policy.hpp | 17 +++++++---------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index f8ec6773c8..20c0a7a4de 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -966,19 +966,16 @@ using global_z = IndexGlobal; // contretizers used in forall, scan, and sort policies -using CudaDefaultAvoidMaxOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer, -1>; - -template < typename AvoidMaxOccupancyConcretizer > -using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer; +using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer; using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; -using CudaRecForReduceConcretizer = cuda::MaxOccupancyConcretizer; +using CudaRecForReduceConcretizer = CudaMaxOccupancyConcretizer; -using CudaDefaultConcretizer = cuda::MaxOccupancyConcretizer; +using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer; // policies usable with forall, scan, and sort @@ -1062,25 +1059,25 @@ using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; -template +template using cuda_exec_occ_avoid_max_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; -template +template using 
cuda_exec_occ_avoid_max_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; -template +template using cuda_exec_occ_avoid_max = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; -template +template using cuda_exec_occ_avoid_max_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 3ff0dd553f..aa356c132f 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -961,19 +961,16 @@ using global_z = IndexGlobal; // contretizers used in forall, scan, and sort policies -using HipDefaultAvoidMaxOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer, -1>; - -template < typename AvoidMaxOccupancyConcretizer > -using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; +using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer; using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; -using HipRecForReduceConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; +using HipRecForReduceConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; -using HipDefaultConcretizer = hip::MaxOccupancyConcretizer; +using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; // policies usable with forall, scan, and sort @@ -1017,15 +1014,15 @@ using hip_exec_occ_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipFractionOffsetOccupancyConcretizer, true>; -template +template using hip_exec_occ_avoid_max = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; + HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; -template +template using hip_exec_occ_avoid_max_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; + HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; template using hip_exec_rec_for_reduce = policy::hip::hip_exec< From c44e06bed5439194d060d104f7e4e57c22422dc1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 14:50:41 -0700 Subject: [PATCH 026/108] Use 1/2 occupancy as HipRecForReduceConcretizer --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index aa356c132f..304fa55c32 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -968,7 +968,7 @@ using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcre using HipMaxOccupancyConcretizer = 
hip::MaxOccupancyConcretizer; -using HipRecForReduceConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; +using HipRecForReduceConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; From 69e8bd70fd776b172e91fc0041ea9d1ab75d5124 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 15:06:36 -0700 Subject: [PATCH 027/108] CHanges to occ policies occ_calc now uses the default (may not be max) occ_max added to use max occ_custom added for using whatever concretizer you'd like occ_avoid_max removed --- include/RAJA/policy/cuda/policy.hpp | 50 ++++++++++++++++++++--------- include/RAJA/policy/hip/policy.hpp | 24 ++++++++++---- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 20c0a7a4de..e7a72b2be7 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1022,20 +1022,40 @@ using cuda_exec_async = policy::cuda::cuda_exec_explicit< template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; @@ -1059,25 +1079,25 @@ using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; -template -using cuda_exec_occ_avoid_max_explicit = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; + Concretizer, BLOCKS_PER_SM, Async>; -template -using cuda_exec_occ_avoid_max_explicit_async = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - 
CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; + Concretizer, BLOCKS_PER_SM, true>; -template -using cuda_exec_occ_avoid_max = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; -template -using cuda_exec_occ_avoid_max_async = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 304fa55c32..65c87ff203 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -997,10 +997,20 @@ using hip_exec_async = policy::hip::hip_exec< template using hip_exec_occ_calc = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipMaxOccupancyConcretizer, Async>; + HipDefaultConcretizer, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; + +template +using hip_exec_occ_max = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, Async>; + +template +using hip_exec_occ_max_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipMaxOccupancyConcretizer, true>; @@ -1014,15 +1024,15 @@ using hip_exec_occ_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipFractionOffsetOccupancyConcretizer, true>; -template -using hip_exec_occ_avoid_max = policy::hip::hip_exec< +template +using hip_exec_occ_custom = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; + Concretizer, Async>; -template -using hip_exec_occ_avoid_max_async = policy::hip::hip_exec< +template +using hip_exec_occ_custom_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; + Concretizer, true>; template using hip_exec_rec_for_reduce = policy::hip::hip_exec< From b58c675461aa7e0cd3459d4ed91cc3b3fec2649d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 15:42:04 -0700 Subject: [PATCH 028/108] Add simple cook book with reduction example --- docs/sphinx/user_guide/cook_book.rst | 24 ++++++ .../sphinx/user_guide/cook_book/reduction.rst | 78 +++++++++++++++++++ docs/sphinx/user_guide/feature/reduction.rst | 4 + docs/sphinx/user_guide/index.rst | 1 + 4 files changed, 107 insertions(+) create mode 100644 docs/sphinx/user_guide/cook_book.rst create mode 100644 docs/sphinx/user_guide/cook_book/reduction.rst diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst new file mode 100644 index 0000000000..44c89c3d51 --- /dev/null +++ b/docs/sphinx/user_guide/cook_book.rst @@ -0,0 +1,24 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_cook-book-label: + +************************ +RAJA Cook Book +************************ + +The following sections show common use case patterns and the recommended +RAJA features and policies to use with them. They are intended +for users to copy and paste into their code and provide guidance on +which policy to use with each backend to get good performance. + +.. toctree:: + :maxdepth: 2 + + cook_book/reduction + diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst new file mode 100644 index 0000000000..309561bc38 --- /dev/null +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -0,0 +1,78 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _cook-book-reductions-label: + +======================= +Cooking with Reductions +======================= + +Please see the following section for more info on RAJA reductions: + + * :ref:`feat-reductions-label`. + + +---------------------------- +Reductions with RAJA::forall +---------------------------- + +Here is the setup for a simple reduction example:: + + const int N = 1000; + + int vec[N]; + + for (int i = 0; i < N; ++i) { + + vec[i] = 1; + + } + +Here a simple sum reduction is performed in a for loop:: + + int vsum = 0; + + // Run a kernel using the reduction objects + for (int i = 0; i < N; ++i) { + + vsum += vec[i]; + + } + +The results of these operations will yield the following values: + + * vsum == 1000 + +Here a simple sum reduction is performed using RAJA:: + + using reduce_policy = RAJA::seq_reduce; + // using reduce_policy = RAJA::omp_reduce; + // using reduce_policy = RAJA::omp_target_reduce; + // using reduce_policy = RAJA::cuda_reduce; + // using reduce_policy = RAJA::hip_reduce; + // using reduce_policy = RAJA::sycl_reduce; + + using exec_policy = RAJA::seq_exec; + // using exec_policy = RAJA::omp_parallel_for_exec; + // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; + // using exec_policy = RAJA::cuda_exec_rec_for_reduce<256>; + // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>; + // using exec_policy = RAJA::sycl_exec<256>; + + RAJA::ReduceSum vsum(0); + + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum += vec[i]; + + }); + +The results of these operations will yield the following values: + + * vsum.get() == 1000 diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 8643e4a225..5f2f09afad 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -39,6 +39,10 @@ RAJA reductions: * :ref:`tut-reduction-label`. +Please see the following cook book sections for guidance on policy usage: + + * :ref:`cook-book-reductions-label`. + ---------------- Reduction Types diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index f2fb6ca46d..f73f4d9449 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -32,5 +32,6 @@ to use RAJA in an application can be found in :ref:`app-considerations-label`. 
using_raja config_options features + cook_book app_considerations tutorial From fed1838d851f3fe39b9fe3e7d33a499e0c1e184a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 16:30:13 -0700 Subject: [PATCH 029/108] Add user guide documentation of the new policies --- docs/sphinx/user_guide/feature/policies.rst | 55 ++++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index d3f982951a..ad1196237d 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -257,7 +257,7 @@ policies have the prefix ``hip_``. Note that the thread-block size and grid size must be provided, there is no default. - cuda/hip_exec_occ_calc forall Execute loop iterations + cuda/hip_exec_occ_max forall Execute loop iterations mapped to global threads via grid striding with multiple iterations per global thread @@ -265,12 +265,20 @@ policies have the prefix ``hip_``. with given thread-block size and grid size bounded by the maximum occupancy of - the kernel. Note that the - thread-block size must - be provided, there is no - default. Note this can improve - reducer performance in kernels - with large iteration counts. + the kernel. + cuda/hip_exec_occ_calc forall Similar to the occ_max + policy but may use less + than the maximum occupancy + of the kernel for performance + reasons. + cuda/hip_exec_occ_fraction> of the maximum occupancy + of the kernel. + cuda/hip_exec_occ_custom policy but the grid size + is determined by the + concretizer. cuda/hip_exec_rec_for_reduce forall The cuda/hip exec policy that is recommended for use with reducers. @@ -414,6 +422,39 @@ policies have the prefix ``hip_``. thread warp. ========================================= ============= ======================================= +When a cuda/hip policy leaves parameters like the block size and/or grid size +unspecified a concretizer object is used to decide those parameters. The +following concretizers are available to use in the cuda/hip_exec_occ_custom +policies: + +=================================================== ========================================= +Execution Policy Brief description +=================================================== ========================================= + +Cuda/HipDefaultConcretizer The default concretizer, expected to + provide good performance in general. + Note that it may not use max occupancy. + +Cuda/HipRecForReduceConcretizer Expected to provide good performance + in loops with reducers. + Note that it may not use max occupancy. + +Cuda/HipMaxOccupancyConcretizer Uses max occupancy. + +Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupancy of the + device in terms of threads. + Note that it may use the max occupancy + of the function if that is below the max + occupancy of the device. + +Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an + Fraction, occupancy based on the max occupancy + BLOCKS_PER_SM_OFFSET> Using the following formula. + (Fraction * kernel_max_blocks_per_sm + + BLOCKS_PER_SM_OFFSET) * sm_per_device + +=================================================== ========================================= + Several notable constraints apply to RAJA CUDA/HIP *direct* policies. .. 
note:: * Repeating direct policies with the same dimension in perfectly From 9e698bbe553d5560762db439b1573225e6d28115 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 6 Apr 2024 14:22:17 -0700 Subject: [PATCH 030/108] Change policy in tests --- test/include/RAJA_test-forall-execpol.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index f09ff71182..40adaccc8c 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -110,7 +110,7 @@ using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_grid<256, 64>, RAJA::cuda_exec_explicit<256,2>, RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction>, - RAJA::cuda_exec_occ_avoid_max<256> >; + RAJA::cuda_exec_occ_custom<256, RAJA::CudaAvoidDeviceMaxThreadOccupancyConcretizer> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -123,7 +123,7 @@ using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, RAJA::hip_exec_grid<256, 64>, RAJA::hip_exec_occ_fraction<256, RAJA::Fraction>, - RAJA::hip_exec_occ_avoid_max<256> >; + RAJA::hip_exec_occ_custom<256, RAJA::HipAvoidDeviceMaxThreadOccupancyConcretizer> >; using HipForallReduceExecPols = HipForallExecPols; From aae648ccce6c3edd53700d83e51a0b85cb3674ed Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:08:29 +0200 Subject: [PATCH 031/108] From RSC: add CARE --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index b09f869f9d..078498cdfc 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit b09f869f9d9aff6ecf6544a0161d96c2b18d13b8 +Subproject commit 078498cdfcc5b6024ff44964d4032a5ad5793a2f From 08f1ee0fd08ccbd15d7036b7b7f8b61827afbb36 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 11 Apr 2024 19:01:37 +0200 Subject: [PATCH 032/108] From RSC: Fix merge with CARE package --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 078498cdfc..5dfa405e08 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 078498cdfcc5b6024ff44964d4032a5ad5793a2f +Subproject commit 5dfa405e0883e5177ee96d4995cd57be4b254d8f From b865799f7756878d18a002c1cc3b7816efaeeb67 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Mon, 15 Apr 2024 11:55:49 +0200 Subject: [PATCH 033/108] Point at RADIUSS Spack Configs @ main --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 5dfa405e08..a8d22367e0 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 5dfa405e0883e5177ee96d4995cd57be4b254d8f +Subproject commit a8d22367e03d4c9c180a11886414430bdf6428a8 From db437d23011957565a22d58deaa64f4a1717738c Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Mon, 15 Apr 2024 17:47:40 +0200 Subject: [PATCH 034/108] Update .gitlab/custom-jobs-and-variables.yml Co-authored-by: Rich Hornung --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index f652bb2caf..da32e89a77 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -49,7 +49,7 @@ variables: # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp +vectorization +tests" + PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona PROJECT_TIOGA_DEPS: "^blt@develop " From e7fd18cfea5467f27ada643cdb7cced1e0df3937 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 09:42:35 -0700 Subject: [PATCH 035/108] bug fixes for the bump style allocator --- include/RAJA/pattern/launch/launch_core.hpp | 8 ++++++-- .../shared_mem/tests/test-launch-DynamicMem.hpp | 12 ++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 6f56f4ed65..4a2f6c222a 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -174,10 +174,14 @@ class LaunchContext template RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) { - T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + //Calculate offset in bytes with a char pointer + char* mem_ptr = (char*) shared_mem_ptr + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); - return mem_ptr; + + //convert to desired type + return (T *) mem_ptr; } /* diff --git a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp index cdb8940256..8da7b81eb7 100644 --- a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp +++ b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp @@ -36,12 +36,16 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) for(s_type b=0; b (RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)), RAJA::Threads(RAJA::stripIndexType(thread_range)), shared_mem_size), @@ -52,7 +56,11 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) INDEX_TYPE * tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); RAJA::View> Tile(tile_ptr, RAJA::stripIndexType(thread_range)); + int * int_tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); + RAJA::View> Int_Tile(int_tile_ptr, RAJA::stripIndexType(thread_range)); + RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { + 
Int_Tile(RAJA::stripIndexType(tid)) = RAJA::stripIndexType(tid); Tile(RAJA::stripIndexType(thread_range)-RAJA::stripIndexType(tid)-1) = thread_range-tid-1 + thread_range*bid; }); @@ -60,7 +68,7 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { INDEX_TYPE idx = tid + thread_range * bid; - working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)); + working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)) + Int_Tile(RAJA::stripIndexType(tid)); }); ctx.releaseSharedMemory(); From dd7b78f267c2495551075fb6bb56fd77d1085c4d Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 12:51:50 -0700 Subject: [PATCH 036/108] Update include/RAJA/pattern/launch/launch_core.hpp Co-authored-by: Rich Hornung --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 4a2f6c222a..f03ad1e075 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -176,7 +176,7 @@ class LaunchContext { //Calculate offset in bytes with a char pointer - char* mem_ptr = (char*) shared_mem_ptr + shared_mem_offset; + char* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); From bb0eaa5c8be3dfefaf8ca849ff61a6c548c76639 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 12:51:54 -0700 Subject: [PATCH 037/108] Update include/RAJA/pattern/launch/launch_core.hpp Co-authored-by: Rich Hornung --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index f03ad1e075..727f9b064c 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -181,7 +181,7 @@ class LaunchContext shared_mem_offset += bytes*sizeof(T); //convert to desired type - return (T *) mem_ptr; + return static_cast(mem_ptr); } /* From 4b846d6a0fd2e7914c5fd650237b8eb119fe95f3 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 13:21:16 -0700 Subject: [PATCH 038/108] char -> void --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 727f9b064c..213c435236 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -176,7 +176,7 @@ class LaunchContext { //Calculate offset in bytes with a char pointer - char* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; + void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); From 53e6feb6bbc1a0a76384d147294da66f410cf3b7 Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:35:05 +0200 Subject: [PATCH 039/108] Use new pci queue on tioga --- .gitlab/custom-jobs-and-variables.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index da32e89a77..b869af6f50 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -45,9 +45,9 @@ variables: # Tioga # Arguments for top level allocation - 
TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1 -o per-resource.count=2" + TIOGA_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" + TIOGA_JOB_ALLOC: "--queue=pci --nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona From 3d79e968348cf63a922c4ead633902611560f8ff Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 16 Apr 2024 13:03:13 +0200 Subject: [PATCH 040/108] pci queue not recognized in sub-job --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index b869af6f50..e6da7cecbf 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -47,7 +47,7 @@ variables: # Arguments for top level allocation TIOGA_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - TIOGA_JOB_ALLOC: "--queue=pci --nodes=1 --begin-time=+5s" + TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona From 3385e0f070963f4f741b253d6b8c89207c0a9a44 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:13:06 -0700 Subject: [PATCH 041/108] Add more documentation on the exec and reduce policies to the reductioun cookbook --- .../sphinx/user_guide/cook_book/reduction.rst | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index 309561bc38..5c17e3a626 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -48,14 +48,13 @@ The results of these operations will yield the following values: * vsum == 1000 -Here a simple sum reduction is performed using RAJA:: +RAJA uses policy types to specify how things are implemented. - using reduce_policy = RAJA::seq_reduce; - // using reduce_policy = RAJA::omp_reduce; - // using reduce_policy = RAJA::omp_target_reduce; - // using reduce_policy = RAJA::cuda_reduce; - // using reduce_policy = RAJA::hip_reduce; - // using reduce_policy = RAJA::sycl_reduce; +The forall execution policy specifies how the loop is run in the forall. +For example ``RAJA::seq_exec`` runs a c-style for loop. The +``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a cuda kernel with +256 threads per block and other cuda kernel launch parameters, like the +number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; @@ -64,6 +63,22 @@ Here a simple sum reduction is performed using RAJA:: // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>; // using exec_policy = RAJA::sycl_exec<256>; +The reduction policy specifies how the reduction is done and must match the +execution policy. For example ``RAJA::seq_reduce`` does a sequential reduction +and can only be used with sequential execution policies. 
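A minimal sketch of one way to keep the execution and reduction policies matched is to select the pair once behind a build-time switch; the alias names ``exec_policy``/``reduce_policy`` and the 256 thread-block size below are illustrative assumptions, not prescribed by RAJA::

    // Hypothetical policy-selection sketch; the alias names and block size
    // are assumptions chosen for illustration.
    #if defined(RAJA_ENABLE_CUDA)
      using exec_policy   = RAJA::cuda_exec_rec_for_reduce<256>;
      using reduce_policy = RAJA::cuda_reduce_atomic;
    #elif defined(RAJA_ENABLE_HIP)
      using exec_policy   = RAJA::hip_exec_rec_for_reduce<256>;
      using reduce_policy = RAJA::hip_reduce_atomic;
    #else
      using exec_policy   = RAJA::seq_exec;
      using reduce_policy = RAJA::seq_reduce;
    #endif

With such a pair in hand, application loops can refer to a single alias and remain portable across backends.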
The +``RAJA::cuda_reduce_atomic`` policy uses atomics, if possible with the given +data type, and can only be used with cuda execution policies.:: + + using reduce_policy = RAJA::seq_reduce; + // using reduce_policy = RAJA::omp_reduce; + // using reduce_policy = RAJA::omp_target_reduce; + // using reduce_policy = RAJA::cuda_reduce_atomic; + // using reduce_policy = RAJA::hip_reduce_atomic; + // using reduce_policy = RAJA::sycl_reduce; + + +Here a simple sum reduction is performed using RAJA:: + RAJA::ReduceSum vsum(0); RAJA::forall( RAJA::RangeSegment(0, N), From c8ba75e068180f2b287363bd1250919834491253 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:17:00 -0700 Subject: [PATCH 042/108] Add more explanation of rec_for_reduce policy --- docs/sphinx/user_guide/feature/policies.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index ad1196237d..5a0670a657 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -269,6 +269,7 @@ policies have the prefix ``hip_``. cuda/hip_exec_occ_calc forall Similar to the occ_max policy but may use less than the maximum occupancy + determined by the occupancy calculator of the kernel for performance reasons. cuda/hip_exec_occ_fraction forall The cuda/hip exec policy that is recommended for - use with reducers. + use with reducers. In general using + the occupancy calculator policies + are better but exactly how much + occupancy to use differs by platform + so this policy provides a simple way + to get what works best for that platform + without having to know the details. cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed From b966a222357ff17820af1639043e627f9670a610 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:19:13 -0700 Subject: [PATCH 043/108] Improve cuda/hip concretizer docs --- docs/sphinx/user_guide/feature/policies.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 5a0670a657..3554873a08 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -431,7 +431,7 @@ policies have the prefix ``hip_``. When a cuda/hip policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. The -following concretizers are available to use in the cuda/hip_exec_occ_custom +following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` policies: =================================================== ========================================= @@ -451,12 +451,12 @@ Cuda/HipMaxOccupancyConcretizer Uses max occupancy. Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupancy of the device in terms of threads. Note that it may use the max occupancy - of the function if that is below the max + of the kernel if that is below the max occupancy of the device. Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an Fraction, occupancy based on the max occupancy - BLOCKS_PER_SM_OFFSET> Using the following formula. 
+ BLOCKS_PER_SM_OFFSET> Using the following formula: (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * sm_per_device From bad690723e2e5468d261b4778c084cc3541aa9f9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:45:43 -0700 Subject: [PATCH 044/108] Improve docs in MemUtils_CUDA/HIP --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 73 ++++++++++++++-------- include/RAJA/policy/hip/MemUtils_HIP.hpp | 51 +++++++++------ 2 files changed, 78 insertions(+), 46 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 7eee19dacf..54a8a7e008 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -279,6 +279,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE cudaDeviceProp get_device_prop() { @@ -289,6 +290,7 @@ cudaDeviceProp get_device_prop() return prop; } +//! Get a cached copy of the device properties RAJA_INLINE cudaDeviceProp& device_prop() { @@ -297,12 +299,14 @@ cudaDeviceProp& device_prop() } +//! Struct with the maximum theoretical occupancy of the device struct CudaFixedMaxBlocksData { int device_sm_per_device; int device_max_threads_per_sm; }; +//! Get the maximum theoretical occupancy of the device RAJA_INLINE CudaFixedMaxBlocksData cuda_max_blocks() { @@ -313,6 +317,7 @@ CudaFixedMaxBlocksData cuda_max_blocks() return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct CudaOccMaxBlocksThreadsData { size_t func_dynamic_shmem_per_block; @@ -320,15 +325,18 @@ struct CudaOccMaxBlocksThreadsData int func_max_threads_per_block; }; +//! Get the maximum occupancy of a kernel with unknown threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, size_t func_dynamic_shmem_per_block) { + static constexpr int uninitialized_int = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksThreadsData data { - std::numeric_limits::max(), - -1, - -1 }; + uninitialized_size_t, + uninitialized_int, + uninitialized_int }; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { @@ -342,6 +350,7 @@ CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, return data; } +//! Struct with the maximum occupancy of a kernel in specific terms struct CudaOccMaxBlocksData { size_t func_dynamic_shmem_per_block; @@ -351,17 +360,20 @@ struct CudaOccMaxBlocksData int func_max_blocks_per_sm; }; +//! Get the maximum occupancy of a kernel with compile time threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block) { + static constexpr int uninitialized_int = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksData data { - std::numeric_limits::max(), + uninitialized_size_t, func_threads_per_block, cuda::device_prop().multiProcessorCount, cuda::device_prop().maxThreadsPerMultiProcessor, - -1 }; + uninitialized_int }; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { @@ -375,17 +387,20 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, return data; } +//! 
Get the maximum occupancy of a kernel with runtime threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block, int func_threads_per_block) { + static constexpr int uninitialized_int = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksData data { - std::numeric_limits::max(), - -1, + uninitialized_size_t, + uninitialized_int, cuda::device_prop().multiProcessorCount, cuda::device_prop().maxThreadsPerMultiProcessor, - -1 }; + uninitialized_int }; if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || data.func_threads_per_block != func_threads_per_block ) { @@ -401,17 +416,31 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, return data; } + /*! ****************************************************************************** * - * \brief Cuda Concretizer Implementation. + * \brief Concretizer Implementation that chooses block size and/or grid + * size when they has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. - * \tparam Concretizer Class the determines the max number of blocks to use when - * fitting for the device. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. * \tparam UniqueMarker A type that is unique to each global function, used to * help cache the occupancy data for that global function. * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. + * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * ****************************************************************************** */ template < typename IdxT, typename Concretizer, typename UniqueMarker> @@ -423,7 +452,6 @@ struct ConcretizerImpl , m_len(len) { } - // Get the maximum block size IdxT get_max_block_size() const { auto data = cuda_occupancy_max_blocks_threads( @@ -432,8 +460,7 @@ struct ConcretizerImpl return func_max_threads_per_block; } - // Get a block size that combined with the given grid size is large enough - // to do len work, or 0 if not possible + //! Get a block size when grid size is specified IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -445,16 +472,14 @@ struct ConcretizerImpl } } - // Get a grid size that combined with the given block size is large enough - // to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const { IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); return func_blocks_per_device; } - // Get a block size and grid size that combined is large enough - // to do len work + //! 
Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_len() const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -463,9 +488,7 @@ struct ConcretizerImpl func_blocks_per_device); } - // Get a block size that combined with the given grid size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a block size when grid size is specified IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -473,9 +496,7 @@ struct ConcretizerImpl return std::min(func_threads_per_block, func_max_threads_per_block); } - // Get a grid size that combined with the given block size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { auto data = cuda_occupancy_max_blocks( @@ -485,9 +506,7 @@ struct ConcretizerImpl return std::min(func_blocks_per_device, func_max_blocks_per_device); } - // Get a block size and grid size that combined is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_device() const { IdxT func_max_threads_per_block = this->get_max_block_size(); diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 9b8442637b..bfb07bc569 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -281,6 +281,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE hipDeviceProp_t get_device_prop() { @@ -291,6 +292,7 @@ hipDeviceProp_t get_device_prop() return prop; } +//! Get a cached copy of the device properties RAJA_INLINE hipDeviceProp_t& device_prop() { @@ -299,12 +301,14 @@ hipDeviceProp_t& device_prop() } +//! Struct with the maximum theoretical occupancy of the device struct HipFixedMaxBlocksData { int device_sm_per_device; int device_max_threads_per_sm; }; +//! Get the maximum theoretical occupancy of the device RAJA_INLINE HipFixedMaxBlocksData hip_max_blocks() { @@ -315,6 +319,7 @@ HipFixedMaxBlocksData hip_max_blocks() return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct HipOccMaxBlocksThreadsData { size_t func_dynamic_shmem_per_block; @@ -322,6 +327,7 @@ struct HipOccMaxBlocksThreadsData int func_max_threads_per_block; }; +//! Get the maximum occupancy of a kernel with unknown threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, @@ -351,6 +357,7 @@ HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, return data; } +//! Struct with the maximum occupancy of a kernel in specific terms struct HipOccMaxBlocksData { size_t func_dynamic_shmem_per_block; @@ -360,6 +367,7 @@ struct HipOccMaxBlocksData int func_max_blocks_per_sm; }; +//! 
Get the maximum occupancy of a kernel with compile time threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, @@ -391,6 +399,7 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, return data; } +//! Get the maximum occupancy of a kernel with runtime threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, @@ -423,17 +432,31 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, return data; } + /*! ****************************************************************************** * - * \brief Hip Concretizer Implementation. + * \brief Concretizer Implementation that chooses block size and/or grid + * size when they has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. - * \tparam Concretizer Class the determines the max number of blocks to use when - * fitting for the device. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. * \tparam UniqueMarker A type that is unique to each global function, used to * help cache the occupancy data for that global function. * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. + * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * ****************************************************************************** */ template < typename IdxT, typename Concretizer, typename UniqueMarker> @@ -445,7 +468,6 @@ struct ConcretizerImpl , m_len(len) { } - // Get the maximum block size IdxT get_max_block_size() const { auto data = hip_occupancy_max_blocks_threads( @@ -454,8 +476,7 @@ struct ConcretizerImpl return func_max_threads_per_block; } - // Get a block size that combined with the given grid size is large enough - // to do len work, or 0 if not possible + //! Get a block size when grid size is specified IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -467,16 +488,14 @@ struct ConcretizerImpl } } - // Get a grid size that combined with the given block size is large enough - // to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const { IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); return func_blocks_per_device; } - // Get a block size and grid size that combined is large enough - // to do len work + //! Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_len() const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -485,9 +504,7 @@ struct ConcretizerImpl func_blocks_per_device); } - // Get a block size that combined with the given grid size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! 
Get a block size when grid size is specified IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -495,9 +512,7 @@ struct ConcretizerImpl return std::min(func_threads_per_block, func_max_threads_per_block); } - // Get a grid size that combined with the given block size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { auto data = hip_occupancy_max_blocks( @@ -507,9 +522,7 @@ struct ConcretizerImpl return std::min(func_blocks_per_device, func_max_blocks_per_device); } - // Get a block size and grid size that combined is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_device() const { IdxT func_max_threads_per_block = this->get_max_block_size(); From 06b33c6b3edc9de102acc710fe9cab3da6c2b241 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 17 Apr 2024 07:49:52 -0700 Subject: [PATCH 045/108] fixup fraction static_assert --- include/RAJA/util/types.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 011082953d..03cd3b3deb 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -196,7 +196,7 @@ struct SizeList { template struct Fraction { - static_assert(denominator != int_t(0), "denominator may not be zero"); + static_assert(denominator != int_t(0), "denominator must not be zero"); using inverse = Fraction; From 33c92b508ebfeb7546f5ee6164b91476768268d6 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 17 Apr 2024 09:42:18 -0700 Subject: [PATCH 046/108] outmost index is the fastest index in sycl, swap the order around --- include/RAJA/policy/sycl/launch.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 0dffee6a21..9176444cd4 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -56,13 +56,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - params.threads.value[2] * params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; @@ -138,13 +138,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - params.threads.value[2] * 
params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; From 31ff12a79118ae0ccdd2d37a535890b490d3332a Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Wed, 17 Apr 2024 19:49:52 +0200 Subject: [PATCH 047/108] Apply changes required by LC (token handling) --- .gitlab-ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8c1f7a472e..fb6bc7055c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -75,7 +75,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -100,9 +100,11 @@ trigger-rajaperf: strategy: depend include: + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' From 3b424476a0a8b2e08fe6f327087ecfffba0b7df2 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 17 Apr 2024 10:54:12 -0700 Subject: [PATCH 048/108] swap ordering of index as sycl uses a c-style convection --- .../RAJA_test-launch-direct-teams-threads-1D-execpol.hpp | 4 ++-- .../RAJA_test-launch-direct-teams-threads-3D-execpol.hpp | 8 ++++---- test/include/RAJA_test-launch-execpol.hpp | 2 +- .../RAJA_test-launch-loop-teams-threads-1D-execpol.hpp | 4 ++-- .../RAJA_test-launch-loop-teams-threads-3D-execpol.hpp | 8 ++++---- test/include/RAJA_test-launch-runtime-execpol.hpp | 8 ++++---- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp index 5b5dfdbebf..7179e48fdc 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp @@ -81,8 +81,8 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp index 38bc4c8bb0..f84823e414 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp @@ -100,12 +100,12 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-execpol.hpp b/test/include/RAJA_test-launch-execpol.hpp index 9961cd0741..fea90a8305 100644 --- a/test/include/RAJA_test-launch-execpol.hpp +++ b/test/include/RAJA_test-launch-execpol.hpp @@ -68,7 +68,7 @@ using Hip_launch_policies = camp::list< using sycl_policies = camp::list< RAJA::LaunchPolicy>, - 
RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using Sycl_launch_policies = camp::list< sycl_policies diff --git a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp index 9e5779853c..6173fc6ffa 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp @@ -75,8 +75,8 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp index 9d217757b2..d703216a13 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp @@ -95,12 +95,12 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest index RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest index + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-runtime-execpol.hpp b/test/include/RAJA_test-launch-runtime-execpol.hpp index bec07358e6..fa2b39f761 100644 --- a/test/include/RAJA_test-launch-runtime-execpol.hpp +++ b/test/include/RAJA_test-launch-runtime-execpol.hpp @@ -52,8 +52,8 @@ using Sequential_launch_policies = camp::list; using seq_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sequential_launch_policies = camp::list; @@ -110,8 +110,8 @@ using OpenMP_launch_policies = camp::list; using omp_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using OpenMP_launch_policies = camp::list; From 27ec80d4c44e1c409519a9e23d5c1eb23a14c6bd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 11:12:05 -0700 Subject: [PATCH 049/108] Apply suggestions from code review to docs Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/cook_book.rst | 3 +-- docs/sphinx/user_guide/cook_book/reduction.rst | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst index 44c89c3d51..91494f3674 100644 --- a/docs/sphinx/user_guide/cook_book.rst +++ b/docs/sphinx/user_guide/cook_book.rst @@ -14,8 +14,7 @@ RAJA Cook Book The following sections show common use case patterns and the recommended RAJA features and policies to use with them. They are intended -for users to copy and paste into their code and provide guidance on -which policy to use with each backend to get good performance. +to provide users with complete beyond usage examples beyond what can be found in other parts of the RAJA User Guide. In particular, the examples and discussion provide guidance on RAJA execution policy selection to improve performance of user application codes. .. 
toctree:: :maxdepth: 2 diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index 5c17e3a626..e8925ee019 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -12,7 +12,7 @@ Cooking with Reductions ======================= -Please see the following section for more info on RAJA reductions: +Please see the following section for overview discussion about RAJA reductions: * :ref:`feat-reductions-label`. @@ -50,10 +50,10 @@ The results of these operations will yield the following values: RAJA uses policy types to specify how things are implemented. -The forall execution policy specifies how the loop is run in the forall. -For example ``RAJA::seq_exec`` runs a c-style for loop. The -``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a cuda kernel with -256 threads per block and other cuda kernel launch parameters, like the +The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. +For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The +``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a CUDA GPU kernel with +256 threads per block and other CUDA kernel launch parameters, like the number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; @@ -67,7 +67,7 @@ The reduction policy specifies how the reduction is done and must match the execution policy. For example ``RAJA::seq_reduce`` does a sequential reduction and can only be used with sequential execution policies. The ``RAJA::cuda_reduce_atomic`` policy uses atomics, if possible with the given -data type, and can only be used with cuda execution policies.:: +data type, and can only be used with cuda execution policies. Similarly for other RAJA execution back-ends, such as HIP and OpenMP. Here are example RAJA reduction policies whose names are indicative of which execution policies they work with:: using reduce_policy = RAJA::seq_reduce; // using reduce_policy = RAJA::omp_reduce; From 2bd50afd92ccdf7fd3ed114f18e9446f3b735249 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 11:13:05 -0700 Subject: [PATCH 050/108] Apply suggestions from code review Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 3554873a08..3b95b8e153 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -429,7 +429,7 @@ policies have the prefix ``hip_``. thread warp. ========================================= ============= ======================================= -When a cuda/hip policy leaves parameters like the block size and/or grid size +When a CUDA or HIP policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. 
The following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` policies: From c8df5e552cd3ba17d482fdbf33c75f045daf4a0c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 11:19:46 -0700 Subject: [PATCH 051/108] Apply suggestions from code review Co-authored-by: Rich Hornung --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 4 ++-- include/RAJA/policy/hip/MemUtils_HIP.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 54a8a7e008..95dbd4bbba 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -290,7 +290,7 @@ cudaDeviceProp get_device_prop() return prop; } -//! Get a cached copy of the device properties +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE cudaDeviceProp& device_prop() { @@ -421,7 +421,7 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, ****************************************************************************** * * \brief Concretizer Implementation that chooses block size and/or grid - * size when they has not been specified at compile time. + * size when one or both has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. * \tparam Concretizer Class that determines the max number of blocks to use diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index bfb07bc569..af2a39c191 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -292,7 +292,7 @@ hipDeviceProp_t get_device_prop() return prop; } -//! Get a cached copy of the device properties +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE hipDeviceProp_t& device_prop() { @@ -437,7 +437,7 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, ****************************************************************************** * * \brief Concretizer Implementation that chooses block size and/or grid - * size when they has not been specified at compile time. + * size when one or both has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. * \tparam Concretizer Class that determines the max number of blocks to use From a03f647cbdee0c1171b572f44bf8bb6ceb09e841 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 14:59:43 -0700 Subject: [PATCH 052/108] Put initialization in occupancy cacl data structs --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 60 ++++++++-------------- include/RAJA/policy/hip/MemUtils_HIP.hpp | 55 ++++++++------------ 2 files changed, 42 insertions(+), 73 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 95dbd4bbba..4e85f948e8 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -294,25 +294,27 @@ cudaDeviceProp get_device_prop() RAJA_INLINE cudaDeviceProp& device_prop() { - static cudaDeviceProp prop = get_device_prop(); + static thread_local cudaDeviceProp prop = get_device_prop(); return prop; } +static constexpr int cuda_occupancy_uninitialized_int = -1; +static constexpr size_t cuda_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + //! 
Struct with the maximum theoretical occupancy of the device struct CudaFixedMaxBlocksData { - int device_sm_per_device; - int device_max_threads_per_sm; + int device_sm_per_device = cuda::device_prop().multiProcessorCount; + int device_max_threads_per_sm = cuda::device_prop().maxThreadsPerMultiProcessor; }; //! Get the maximum theoretical occupancy of the device RAJA_INLINE CudaFixedMaxBlocksData cuda_max_blocks() { - static thread_local CudaFixedMaxBlocksData data { - cuda::device_prop().multiProcessorCount, - cuda::device_prop().maxThreadsPerMultiProcessor }; + static thread_local CudaFixedMaxBlocksData data; return data; } @@ -320,9 +322,9 @@ CudaFixedMaxBlocksData cuda_max_blocks() //! Struct with the maximum occupancy of a kernel in simple terms struct CudaOccMaxBlocksThreadsData { - size_t func_dynamic_shmem_per_block; - int func_max_blocks_per_device; - int func_max_threads_per_block; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = cuda_occupancy_uninitialized_int; + int func_max_threads_per_block = cuda_occupancy_uninitialized_int; }; //! Get the maximum occupancy of a kernel with unknown threads per block @@ -331,33 +333,26 @@ RAJA_INLINE CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized_int = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksThreadsData data { - uninitialized_size_t, - uninitialized_int, - uninitialized_int }; + static thread_local CudaOccMaxBlocksThreadsData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + cudaErrchk(cudaOccupancyMaxPotentialBlockSize( &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); - data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - } return data; } //! Struct with the maximum occupancy of a kernel in specific terms -struct CudaOccMaxBlocksData +struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData { - size_t func_dynamic_shmem_per_block; - int func_threads_per_block; - int device_sm_per_device; - int device_max_threads_per_sm; - int func_max_blocks_per_sm; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_threads_per_block = cuda_occupancy_uninitialized_int; + int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int; }; //! 
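// Note on the caching pattern above: each of these occupancy helpers is
// templated on a UniqueMarker type, so the `static thread_local` data object
// is a distinct instance per kernel (and per thread), and with this change its
// members start at the sentinel "uninitialized" values via in-class
// initializers, so the first call always performs the query and later calls
// re-query only when the dynamic shared memory size changes. The same idiom in
// miniature, where `Cached`, `cached_query`, and `expensive_query` are
// placeholder names standing in for the structs and cudaOccupancy* calls used
// here, not RAJA API:
//
//   struct Cached {
//     size_t key   = std::numeric_limits<size_t>::max();  // sentinel
//     int    value = -1;                                   // sentinel
//   };
//
//   template < typename UniqueMarker >
//   int cached_query(size_t key)
//   {
//     static thread_local Cached c;       // one cache per marker and thread
//     if (c.key != key) {                 // re-query only when the key changes
//       c.key   = key;
//       c.value = expensive_query(key);   // placeholder for the real query
//     }
//     return c.value;
//   }
//!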
Get the maximum occupancy of a kernel with compile time threads per block @@ -366,18 +361,12 @@ RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized_int = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksData data { - uninitialized_size_t, - func_threads_per_block, - cuda::device_prop().multiProcessorCount, - cuda::device_prop().maxThreadsPerMultiProcessor, - uninitialized_int }; + static thread_local CudaOccMaxBlocksData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); @@ -393,14 +382,7 @@ RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized_int = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksData data { - uninitialized_size_t, - uninitialized_int, - cuda::device_prop().multiProcessorCount, - cuda::device_prop().maxThreadsPerMultiProcessor, - uninitialized_int }; + static thread_local CudaOccMaxBlocksData data; if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || data.func_threads_per_block != func_threads_per_block ) { diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index af2a39c191..82b7bfc633 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -296,25 +296,27 @@ hipDeviceProp_t get_device_prop() RAJA_INLINE hipDeviceProp_t& device_prop() { - static hipDeviceProp_t prop = get_device_prop(); + static thread_local hipDeviceProp_t prop = get_device_prop(); return prop; } +static constexpr int hip_occupancy_uninitialized_int = -1; +static constexpr size_t hip_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + //! Struct with the maximum theoretical occupancy of the device struct HipFixedMaxBlocksData { - int device_sm_per_device; - int device_max_threads_per_sm; + int device_sm_per_device = hip::device_prop().multiProcessorCount; + int device_max_threads_per_sm = hip::device_prop().maxThreadsPerMultiProcessor; }; //! Get the maximum theoretical occupancy of the device RAJA_INLINE HipFixedMaxBlocksData hip_max_blocks() { - static thread_local HipFixedMaxBlocksData data { - hip::device_prop().multiProcessorCount, - hip::device_prop().maxThreadsPerMultiProcessor }; + static thread_local HipFixedMaxBlocksData data; return data; } @@ -322,9 +324,9 @@ HipFixedMaxBlocksData hip_max_blocks() //! Struct with the maximum occupancy of a kernel in simple terms struct HipOccMaxBlocksThreadsData { - size_t func_dynamic_shmem_per_block; - int func_max_blocks_per_device; - int func_max_threads_per_block; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = hip_occupancy_uninitialized_int; + int func_max_threads_per_block = hip_occupancy_uninitialized_int; }; //! 
Get the maximum occupancy of a kernel with unknown threads per block @@ -333,13 +335,12 @@ RAJA_INLINE HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, size_t func_dynamic_shmem_per_block) { - static thread_local HipOccMaxBlocksThreadsData data { - std::numeric_limits::max(), - -1, - -1 }; + static thread_local HipOccMaxBlocksThreadsData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxPotentialBlockSize( &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); @@ -350,21 +351,17 @@ HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, data.func_max_threads_per_block = 1024; #endif - data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - } return data; } //! Struct with the maximum occupancy of a kernel in specific terms -struct HipOccMaxBlocksData +struct HipOccMaxBlocksData : HipFixedMaxBlocksData { - size_t func_dynamic_shmem_per_block; - int func_threads_per_block; - int device_sm_per_device; - int device_max_threads_per_sm; - int func_max_blocks_per_sm; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_threads_per_block = hip_occupancy_uninitialized_int; + int func_max_blocks_per_sm = hip_occupancy_uninitialized_int; }; //! Get the maximum occupancy of a kernel with compile time threads per block @@ -373,16 +370,12 @@ RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block) { - static thread_local HipOccMaxBlocksData data { - std::numeric_limits::max(), - func_threads_per_block, - hip::device_prop().multiProcessorCount, - hip::device_prop().maxThreadsPerMultiProcessor, - -1 }; + static thread_local HipOccMaxBlocksData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( @@ -393,7 +386,6 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - } return data; @@ -405,12 +397,7 @@ RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static thread_local HipOccMaxBlocksData data { - std::numeric_limits::max(), - -1, - hip::device_prop().multiProcessorCount, - hip::device_prop().maxThreadsPerMultiProcessor, - -1 }; + static thread_local HipOccMaxBlocksData data; if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || data.func_threads_per_block != func_threads_per_block ) { From 36fe701474273235f10b8d2375c01981adb9728e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Apr 2024 14:53:58 -0700 Subject: [PATCH 053/108] Add a for_each impl with overload for camp::list of types --- include/RAJA/RAJA.hpp | 1 + include/RAJA/util/for_each.hpp | 95 +++++++++++ test/unit/algorithm/CMakeLists.txt | 5 + .../test-algorithm-util-for_each.cpp | 150 ++++++++++++++++++ 4 files changed, 251 insertions(+) create mode 100644 include/RAJA/util/for_each.hpp create mode 100644 test/unit/algorithm/test-algorithm-util-for_each.cpp diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 
f41aad477b..32522a1f0d 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -35,6 +35,7 @@ #include "RAJA/util/types.hpp" #include "RAJA/util/plugins.hpp" #include "RAJA/util/Registry.hpp" +#include "RAJA/util/for_each.hpp" // diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp new file mode 100644 index 0000000000..c95f40da35 --- /dev/null +++ b/include/RAJA/util/for_each.hpp @@ -0,0 +1,95 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA for_each templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_for_each_HPP +#define RAJA_util_for_each_HPP + +#include "RAJA/config.hpp" + +#include +#include + +#include "camp/list.hpp" + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/types.hpp" + +namespace RAJA +{ + +namespace detail +{ + +// runtime loop applying func to each element in the range in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) +{ + for (; begin != end; ++begin) { + func(*begin); + } + + return func; +} + +// compile time expansion applying func to a each type in the list in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const&, UnaryFunc func) +{ + // braced init lists are evaluated in order + int seq_unused_array[] = {(func(Ts{}), 0)...}; + RAJA_UNUSED_VAR(seq_unused_array); + + return func; +} + +} // namespace detail + + +/*! + \brief Apply func to all the elements in the given range in order + using a sequential for loop in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/for_each +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + for_each(Container&& c, UnaryFunc func) +{ + using std::begin; + using std::end; + + return detail::for_each(begin(c), end(c), std::move(func)); +} + +/*! 
+ \brief Apply func to each type in the given list in order + using a compile-time expansion in O(N) operations and O(1) extra memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) +{ + return detail::for_each_type(c, std::move(func)); +} + +} // namespace RAJA + +#endif diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt index 856e4519b6..0142a94ed3 100644 --- a/test/unit/algorithm/CMakeLists.txt +++ b/test/unit/algorithm/CMakeLists.txt @@ -88,3 +88,8 @@ unset( SORT_BACKENDS ) unset( SEQUENTIAL_UTIL_SORTS ) unset( CUDA_UTIL_SORTS ) unset( HIP_UTIL_SORTS ) + + +raja_add_test( + NAME test-algorithm-util-for_each + SOURCES test-algorithm-util-for_each.cpp) diff --git a/test/unit/algorithm/test-algorithm-util-for_each.cpp b/test/unit/algorithm/test-algorithm-util-for_each.cpp new file mode 100644 index 0000000000..db918ad234 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-for_each.cpp @@ -0,0 +1,150 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing unit tests for for_each +/// + +#include "RAJA_test-base.hpp" + +#include "RAJA_unit-test-types.hpp" + +#include "camp/resource.hpp" + +#include +#include +#include + +template +class ForEachUnitTest : public ::testing::Test {}; + +TYPED_TEST_SUITE(ForEachUnitTest, UnitIndexTypes); + + +TYPED_TEST(ForEachUnitTest, EmptyRange) +{ + std::vector numbers; + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + number += 1; + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); + ASSERT_EQ(numbers.size(), 0); +} + +TYPED_TEST(ForEachUnitTest, VectorRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 13; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 13); + for (TypeParam i = 0; i < 13; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, RajaSpanRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 11; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(RAJA::make_span(numbers.data(), 11), [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 11); + for (TypeParam i = 0; i < 11; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, SetRange) +{ + std::set numbers; + for (TypeParam i = 0; i < 6; ++i) { + numbers.insert(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam const& number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 6); + for (TypeParam i = 0; i < 6; ++i) { + ASSERT_EQ(i, copies[i]); + ASSERT_EQ(numbers.count(i), 1); + } +} + + +TYPED_TEST(ForEachUnitTest, EmptyTypeList) +{ + using numbers = camp::list<>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); +} + + +template < typename T, T val > +T get_num(std::integral_constant) +{ + return val; +} + +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + using 
numbers = camp::list, + std::integral_constant, + std::integral_constant, + std::integral_constant, + std::integral_constant>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(get_num(number)); + }); + + ASSERT_EQ(copies.size(), 5); + for (TypeParam i = 0; i < 5; ++i) { + ASSERT_EQ(i, copies[i]); + } +} +/// +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + // ignore non-ints +} + +TYPED_TEST(ForEachUnitTest, IntTypeList) +{ + run_int_type_test(); +} From 76760fe478e587909282f62242ffa5faba22c5eb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 27 Apr 2024 14:18:10 -0700 Subject: [PATCH 054/108] Fix zero sized array --- include/RAJA/util/for_each.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp index c95f40da35..b279ec29ff 100644 --- a/include/RAJA/util/for_each.hpp +++ b/include/RAJA/util/for_each.hpp @@ -54,7 +54,7 @@ RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const&, UnaryFunc func) { // braced init lists are evaluated in order - int seq_unused_array[] = {(func(Ts{}), 0)...}; + int seq_unused_array[] = {0, (func(Ts{}), 0)...}; RAJA_UNUSED_VAR(seq_unused_array); return func; From b360da9e9b1111be6f266cfcaed62ff7accc8deb Mon Sep 17 00:00:00 2001 From: artv3 Date: Tue, 30 Apr 2024 10:36:28 -0700 Subject: [PATCH 055/108] add note about thread ordering --- docs/sphinx/user_guide/feature/policies.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 3b95b8e153..5affa42203 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -525,9 +525,24 @@ write more explicit policies. unspecified so a runtime number of threads is used, but grid_size is ignored so blocks are ignored when getting indices. + GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. note:: SYCL uses C++-style ordering in which the right + most index corresponds to having unit stride. + In a three-dimensional compute grid this means + that dimension 2 has the unit stride while + dimension 0 has the longest stride. This is + important to note as the ordering is reverse + compared to the CUDA and HIP programming models. + + When using RAJA launch thread and team configuration + follows CUDA and HIP programming models and is always + configured in three-dimensions. This means that dimension + 2 always exist and should be used as one would the + x dimension for CUDA and HIP. + ======================================== ============= ============================== SYCL Execution Policies Works with Brief description ======================================== ============= ============================== From bad63908c57d83005b93ef7bbe8c6b5e8f5c874e Mon Sep 17 00:00:00 2001 From: artv3 Date: Tue, 30 Apr 2024 10:39:08 -0700 Subject: [PATCH 056/108] more docs --- docs/sphinx/user_guide/feature/policies.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 5affa42203..107bc27af2 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -524,7 +524,6 @@ write more explicit policies. ignored. 
For example in cuda_thread_x_direct block_size is unspecified so a runtime number of threads is used, but grid_size is ignored so blocks are ignored when getting indices. - GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -535,7 +534,9 @@ GPU Policies for SYCL that dimension 2 has the unit stride while dimension 0 has the longest stride. This is important to note as the ordering is reverse - compared to the CUDA and HIP programming models. + compared to the CUDA and HIP programming models. + CUDA and HIP employ a x/y/z ordering in which + dimension x has the unit stride. When using RAJA launch thread and team configuration follows CUDA and HIP programming models and is always From 5ddd86a8c631b78e4bac500458db1d394c3b3af0 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 30 Apr 2024 10:56:15 -0700 Subject: [PATCH 057/108] Update docs/sphinx/user_guide/feature/policies.rst Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 107bc27af2..b3ac763cc9 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -541,7 +541,7 @@ GPU Policies for SYCL When using RAJA launch thread and team configuration follows CUDA and HIP programming models and is always configured in three-dimensions. This means that dimension - 2 always exist and should be used as one would the + 2 always exists and should be used as one would use the x dimension for CUDA and HIP. ======================================== ============= ============================== From aef818c91dee2a78dd12aeedc18caca52f6724ed Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 30 Apr 2024 10:56:25 -0700 Subject: [PATCH 058/108] Update docs/sphinx/user_guide/feature/policies.rst Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index b3ac763cc9..aad065cb16 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -538,7 +538,7 @@ GPU Policies for SYCL CUDA and HIP employ a x/y/z ordering in which dimension x has the unit stride. - When using RAJA launch thread and team configuration + When using RAJA::launch, thread and team configuration follows CUDA and HIP programming models and is always configured in three-dimensions. This means that dimension 2 always exists and should be used as one would use the From 2b0864a40053b338edc02f118bcf65c745bfde46 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 10:26:55 -0700 Subject: [PATCH 059/108] Add options to cuda/hip reduction policies Add replication and atomic_stride to cuda/hip reduction policies. They currently default to 0 which lets RAJA choose values for these parameters automatically. 
--- include/RAJA/policy/cuda/policy.hpp | 4 +- include/RAJA/policy/cuda/reduce.hpp | 236 +++++++++++++++++----------- include/RAJA/policy/hip/policy.hpp | 4 +- include/RAJA/policy/hip/reduce.hpp | 234 ++++++++++++++++----------- 4 files changed, 285 insertions(+), 193 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index e7a72b2be7..c9efc45566 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -238,7 +238,8 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template +template struct cuda_reduce_base : public RAJA:: make_policy_pattern_launch_platform_t -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, + T identity, + TempIterator device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; int blockId = blockIdx.x + gridDim.x * blockIdx.y + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = (numBlocks / replication) - + ((replicationId < (numBlocks % replication)) ? 0 : 1); T temp = block_reduce(val, identity); // one thread per block writes to device_mem - bool lastBlock = false; + bool isLastBlock = false; if (threadId == 0) { device_mem.set(blockId, temp); // ensure write visible to all threadblocks __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); } // returns non-zero value if any thread passes in a non-zero value - lastBlock = __syncthreads_or(lastBlock); + isLastBlock = __syncthreads_or(isLastBlock); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; - for (int i = threadId; i < numBlocks; i += numThreads) { + for (int i = replicationId + threadId*replication; + i < numBlocks; + i += numThreads*replication) { Combiner{}(temp, device_mem.get(i)); } @@ -523,7 +530,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? replicationId : replication; } namespace expt { @@ -653,64 +660,71 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! 
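// Worked example of the block-to-slot mapping in grid_reduce above: block
// blockId feeds replication slot replicationId = blockId % replication, and
// that slot's counter lives at device_count[replicationId * atomic_stride],
// spacing the counters of different slots apart (the default stride is derived
// from ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE further down in this file so that
// slots should not share a cache line). With replication = 4 and numBlocks = 10:
//
//   slot 0 <- blocks 0, 4, 8    wrap_around = 10/4 - 0 = 2
//   slot 1 <- blocks 1, 5, 9    wrap_around = 10/4 - 0 = 2
//   slot 2 <- blocks 2, 6       wrap_around = 10/4 - 1 = 1
//   slot 3 <- blocks 3, 7       wrap_around = 10/4 - 1 = 1
//
// wrap_around is one less than the number of blocks mapped to the slot, so the
// last block to finish in each slot sees old_count == wrap_around; its threads
// then gather the device_mem entries belonging to that slot (indices congruent
// to replicationId mod replication) and thread 0 returns the slot's
// replicationId. Every other thread returns `replication`, which the caller
// reads as "no final value produced here".
//!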
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = numBlocks / replication + + ((replicationId < (numBlocks % replication)) ? 2 : 1); + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; + device_mem[atomicOffset] = identity; // consider making this atomic __threadfence(); - ::atomicAdd(device_count, 1u); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; __threadfence(); - RAJA::reduce::cuda::atomic{}(device_mem[0], temp); + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + val = device_mem[atomicOffset]; // consider making this atomic } } - return lastBlock; + return isLastBlock ? replicationId : replication; } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[replication]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -785,7 +799,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[replication] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -822,7 +836,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! 
get new value for use in resource - T* new_value(::RAJA::resources::Cuda res) + auto new_value(::RAJA::resources::Cuda res) -> T(&)[replication] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -842,7 +856,7 @@ class PinnedTally Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -889,7 +903,8 @@ class PinnedTally //! Reduction data for Cuda Offload -- stores value, host pointer, and device //! pointer -template +template struct Reduce_Data { mutable T value; @@ -898,7 +913,7 @@ struct Reduce_Data { RAJA::detail::SoAPtr device; bool own_device_ptr; - Reduce_Data() : Reduce_Data(T(), T()){}; + Reduce_Data() : Reduce_Data(T(), T()){} /*! \brief create from a default value and offload information * @@ -928,7 +943,13 @@ struct Reduce_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -936,8 +957,10 @@ struct Reduce_Data { { T temp = value; - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -949,9 +972,10 @@ struct Reduce_Data { if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); + size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; + device.allocate(numSlots); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -974,7 +998,8 @@ struct Reduce_Data { //! Reduction data for Cuda Offload -- stores value, host pointer -template +template struct ReduceAtomic_Data { mutable T value; @@ -1008,7 +1033,13 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -1016,9 +1047,10 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -1028,9 +1060,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); + device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -1053,7 +1085,8 @@ struct ReduceAtomic_Data { }; //! 
Cuda Reduction entity -- generalize on reduction, and type -template +template class Reduce { public: @@ -1063,7 +1096,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new PinnedTally}, val(init_val, identity_) { } @@ -1090,9 +1123,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1137,7 +1169,10 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[replication] = *n; + for (size_t r = 0; r < replication; ++r) { + Combiner{}(val.value, values[r]); + } } tally_or_val_ptr.list->free_list(); } @@ -1160,12 +1195,21 @@ class Reduce private: const Reduce* parent; + static constexpr size_t replication = (t_replication > 0) + ? t_replication + : 1; + static constexpr size_t atomic_stride = (t_atomic_stride > 0) + ? t_atomic_stride + : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + //! union to hold either pointer to PinnedTally or poiter to value // only use list before setup for device and only use val_ptr after union tally_u { - PinnedTally* list; + PinnedTally* list; T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; + constexpr tally_u(PinnedTally* l) : list(l){}; constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; }; @@ -1174,8 +1218,8 @@ class Reduce //! cuda reduction data storage class and folding algorithm using reduce_data_type = typename std::conditional< maybe_atomic && RAJA::reduce::cuda::cuda_atomic_available::value, - cuda::ReduceAtomic_Data, - cuda::Reduce_Data>::type; + cuda::ReduceAtomic_Data, + cuda::Reduce_Data>::type; //! storage for reduction data reduce_data_type val; @@ -1184,13 +1228,13 @@ class Reduce } // end namespace cuda //! specialization of ReduceSum for cuda_reduce -template -class ReduceSum, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1202,13 +1246,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for cuda_reduce -template -class ReduceBitOr, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1220,13 +1264,13 @@ class ReduceBitOr, T> }; //! 
specialization of ReduceBitAnd for cuda_reduce -template -class ReduceBitAnd, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1238,13 +1282,13 @@ class ReduceBitAnd, T> }; //! specialization of ReduceMin for cuda_reduce -template -class ReduceMin, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1256,13 +1300,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for cuda_reduce -template -class ReduceMax, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1274,18 +1318,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for cuda_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public cuda::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1324,18 +1368,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for cuda_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public cuda:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! 
constructor requires a default value for the reducer diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 65c87ff203..c814bec83d 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -229,7 +229,8 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template +template struct hip_reduce_base : public RAJA:: make_policy_pattern_launch_platform_t -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, + T identity, + TempIterator device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; int blockId = blockIdx.x + gridDim.x * blockIdx.y + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = (numBlocks / replication) - + ((replicationId < (numBlocks % replication)) ? 0 : 1); T temp = block_reduce(val, identity); // one thread per block writes to device_mem - __shared__ bool lastBlock; + __shared__ bool isLastBlock; if (threadId == 0) { device_mem.set(blockId, temp); // ensure write visible to all threadblocks __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around) ? 1: 0; + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); } // returns non-zero value if any thread passes in a non-zero value __syncthreads(); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; - for (int i = threadId; i < numBlocks; i += numThreads) { + for (int i = replicationId + threadId*replication; + i < numBlocks; + i += numThreads*replication) { Combiner{}(temp, device_mem.get(i)); } @@ -396,7 +403,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? replicationId : replication; } namespace expt { @@ -526,64 +533,71 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = numBlocks / replication + + ((replicationId < (numBlocks % replication)) ? 2 : 1); + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; + device_mem[atomicOffset] = identity; // consider making this atomic __threadfence(); - ::atomicAdd(device_count, 1u); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; __threadfence(); - RAJA::reduce::hip::atomic{}(device_mem[0], temp); + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + val = device_mem[atomicOffset]; // consider making this atomic } } - return lastBlock; + return isLastBlock ? replicationId : replication; } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[replication]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -658,7 +672,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[replication] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -695,7 +709,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! 
get new value for use in resource - T* new_value(::RAJA::resources::Hip res) + auto new_value(::RAJA::resources::Hip res) -> T(&)[replication] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -715,7 +729,7 @@ class PinnedTally Node* n = hip::pinned_mempool_type::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -762,7 +776,8 @@ class PinnedTally //! Reduction data for Hip Offload -- stores value, host pointer, and device //! pointer -template +template struct Reduce_Data { mutable T value; @@ -801,7 +816,13 @@ struct Reduce_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -809,8 +830,10 @@ struct Reduce_Data { { T temp = value; - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -822,9 +845,10 @@ struct Reduce_Data { if (act) { hip_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); + size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; + device.allocate(numSlots); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -847,7 +871,8 @@ struct Reduce_Data { //! Reduction data for Hip Offload -- stores value, host pointer -template +template struct ReduceAtomic_Data { mutable T value; @@ -856,7 +881,7 @@ struct ReduceAtomic_Data { T* device; bool own_device_ptr; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){} ReduceAtomic_Data(T initValue, T identity_) : value{initValue}, @@ -881,7 +906,13 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -889,9 +920,10 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -901,9 +933,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); + device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -926,7 +958,8 @@ struct ReduceAtomic_Data { }; //! 
Hip Reduction entity -- generalize on reduction, and type -template +template class Reduce { public: @@ -936,7 +969,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new PinnedTally}, val(init_val, identity_) { } @@ -963,9 +996,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1010,7 +1042,10 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[replication] = *n; + for (size_t r = 0; r < replication; ++r) { + Combiner{}(val.value, values[r]); + } } tally_or_val_ptr.list->free_list(); } @@ -1033,12 +1068,21 @@ class Reduce private: const Reduce* parent; + static constexpr size_t replication = (t_replication > 0) + ? t_replication + : 32; + static constexpr size_t atomic_stride = (t_atomic_stride > 0) + ? t_atomic_stride + : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + //! union to hold either pointer to PinnedTally or poiter to value // only use list before setup for device and only use val_ptr after union tally_u { - PinnedTally* list; + PinnedTally* list; T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; + constexpr tally_u(PinnedTally* l) : list(l){}; constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; }; @@ -1047,8 +1091,8 @@ class Reduce //! hip reduction data storage class and folding algorithm using reduce_data_type = typename std::conditional< maybe_atomic && RAJA::reduce::hip::hip_atomic_available::value, - hip::ReduceAtomic_Data, - hip::Reduce_Data>::type; + hip::ReduceAtomic_Data, + hip::Reduce_Data>::type; //! storage for reduction data reduce_data_type val; @@ -1057,13 +1101,13 @@ class Reduce } // end namespace hip //! specialization of ReduceSum for hip_reduce -template -class ReduceSum, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1075,13 +1119,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for hip_reduce -template -class ReduceBitOr, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1093,13 +1137,13 @@ class ReduceBitOr, T> }; //! 
specialization of ReduceBitAnd for hip_reduce -template -class ReduceBitAnd, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1111,13 +1155,13 @@ class ReduceBitAnd, T> }; //! specialization of ReduceMin for hip_reduce -template -class ReduceMin, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1129,13 +1173,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for hip_reduce -template -class ReduceMax, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1147,18 +1191,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for hip_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public hip::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1197,18 +1241,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for hip_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public hip:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! 
constructor requires a default value for the reducer From 18d29cef93f02bd4b02e58c1ddbb646d8af9a2c6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 11:43:18 -0700 Subject: [PATCH 060/108] Reorder non-atomic grid reduce device storage This makes the final block for each replication have coalesced reads as it combines the slots --- include/RAJA/policy/cuda/reduce.hpp | 33 +++++++++++++++++------------ include/RAJA/policy/hip/reduce.hpp | 33 +++++++++++++++++------------ 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 22fd7348a0..0dcb1ee3ae 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -490,23 +490,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, (gridDim.x * gridDim.y) * blockIdx.z; int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int replicationId = (blockId%replication); - int atomicOffset = replicationId*atomic_stride; + int replicationId = blockId % replication; + int slotId = blockId / replication; + + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); - unsigned int wrap_around = (numBlocks / replication) - - ((replicationId < (numBlocks % replication)) ? 0 : 1); + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; T temp = block_reduce(val, identity); // one thread per block writes to device_mem bool isLastBlock = false; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value @@ -516,10 +521,10 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (isLastBlock) { temp = identity; - for (int i = replicationId + threadId*replication; - i < numBlocks; - i += numThreads*replication) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -972,8 +977,8 @@ struct Reduce_Data { if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; - device.allocate(numSlots); + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); device_count = device_zeroed_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 4889c7f598..fd79e67600 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -363,23 +363,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, (gridDim.x * gridDim.y) * blockIdx.z; int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int replicationId = (blockId%replication); 
- int atomicOffset = replicationId*atomic_stride; + int replicationId = blockId % replication; + int slotId = blockId / replication; + + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); - unsigned int wrap_around = (numBlocks / replication) - - ((replicationId < (numBlocks % replication)) ? 0 : 1); + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; T temp = block_reduce(val, identity); // one thread per block writes to device_mem __shared__ bool isLastBlock; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value @@ -389,10 +394,10 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (isLastBlock) { temp = identity; - for (int i = replicationId + threadId*replication; - i < numBlocks; - i += numThreads*replication) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -845,8 +850,8 @@ struct Reduce_Data { if (act) { hip_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; - device.allocate(numSlots); + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); device_count = device_zeroed_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; From 76b24505fa5f6f028047435e318420c0c9b01f7d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 13:02:07 -0700 Subject: [PATCH 061/108] Add special case for small numbers of blocks Now the algorithm avoids atomics and extra block reductions if they are unnecessary --- include/RAJA/policy/cuda/reduce.hpp | 25 ++++++++++++++++++++----- include/RAJA/policy/hip/reduce.hpp | 25 ++++++++++++++++++++----- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 0dcb1ee3ae..3704eb4303 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -503,6 +503,13 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T temp = block_reduce(val, identity); + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + // one thread per block writes to device_mem bool isLastBlock = false; if (threadId == 0) { @@ -681,8 +688,16 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, int replicationId = (blockId%replication); int atomicOffset = replicationId*atomic_stride; - unsigned int wrap_around = numBlocks / replication + - ((replicationId < (numBlocks % replication)) ? 
2 : 1); + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } // the first block of each replication initializes device_mem if (threadId == 0) { @@ -705,9 +720,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, __threadfence(); RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index fd79e67600..187de47ee2 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -376,6 +376,13 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T temp = block_reduce(val, identity); + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + // one thread per block writes to device_mem __shared__ bool isLastBlock; if (threadId == 0) { @@ -554,8 +561,16 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, int replicationId = (blockId%replication); int atomicOffset = replicationId*atomic_stride; - unsigned int wrap_around = numBlocks / replication + - ((replicationId < (numBlocks % replication)) ? 2 : 1); + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } // the first block of each replication initializes device_mem if (threadId == 0) { @@ -578,9 +593,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, __threadfence(); RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { From ae50f0895d3dcbfa29dd15a02e98502f278804c1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 6 Apr 2024 23:57:37 -0700 Subject: [PATCH 062/108] Add device pinned allocators These are useful for operations that need to access memory on the host and on the device, but the device accesses are more performance critical. 
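For reference, the allocation pattern behind the new CUDA DevicePinnedAllocator can be sketched outside of RAJA's basic_mempool machinery as below. This is an illustration only, assuming a device that supports managed memory and cudaMemAdvise; the kernel and variable names (fill_kernel, n) are invented for the example and error handling is reduced to early returns.

    // Standalone sketch: managed memory that prefers device residency but
    // remains host-accessible, mirroring the runtime calls used by the allocator.
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void fill_kernel(double* x, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) { x[i] = 2.0 * i; }
    }

    int main()
    {
      const int n = 1024;
      const size_t nbytes = n * sizeof(double);

      int device = 0;
      if (cudaGetDevice(&device) != cudaSuccess) { return 1; }

      void* ptr = nullptr;
      if (cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal) != cudaSuccess) { return 1; }
      // Hint the driver: keep pages resident on the device, but let the host read them.
      cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device);
      cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);

      double* x = static_cast<double*>(ptr);
      fill_kernel<<<(n + 255) / 256, 256>>>(x, n);
      if (cudaDeviceSynchronize() != cudaSuccess) { return 1; }

      printf("x[10] = %f\n", x[10]);  // host read, no explicit copy needed
      cudaFree(ptr);
      return 0;
    }

The corresponding HIP allocator added in this patch takes the simpler route of plain hipMalloc/hipFree, as the diff below shows.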
--- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 25 ++++++++++++++++++++++ include/RAJA/policy/hip/MemUtils_HIP.hpp | 20 +++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 4e85f948e8..5a66aff20e 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -111,9 +111,34 @@ struct DeviceZeroedAllocator { } }; +//! Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + int device; + cudaErrchk(cudaGetDevice(&device)); + void* ptr; + cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)); + + return ptr; + } + + // returns true on success, false on failure + bool free(void* ptr) + { + cudaErrchk(cudaFree(ptr)); + return true; + } +}; + using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 82b7bfc633..63a8c9911c 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -113,9 +113,29 @@ struct DeviceZeroedAllocator { } }; +//! Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + void* ptr; + hipErrchk(hipMalloc(&ptr, nbytes)); + return ptr; + } + + // returns true on success, false on failure + bool free(void* ptr) + { + hipErrchk(hipFree(ptr)); + return true; + } +}; + using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail From e90045709f4a6f1f54f8130e27d0b5cb9a9471ff Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 6 Apr 2024 23:59:02 -0700 Subject: [PATCH 063/108] Add an accessor template arg to SoAPtr --- include/RAJA/util/SoAPtr.hpp | 62 ++++++++++++++++++++++++++++++------ include/RAJA/util/types.hpp | 26 +++++++++++++++ 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp index 616b8d21d4..00a2fce111 100644 --- a/include/RAJA/util/SoAPtr.hpp +++ b/include/RAJA/util/SoAPtr.hpp @@ -20,8 +20,11 @@ #include "RAJA/config.hpp" +#include + // for RAJA::reduce::detail::ValueLoc #include "RAJA/pattern/detail/reduce.hpp" +#include "RAJA/util/types.hpp" namespace RAJA { @@ -38,18 +41,37 @@ namespace detail */ template > + RAJA::basic_mempool::generic_allocator>, + typename accessor = DefaultAccessor > class SoAPtr { - using value_type = T; + template < typename, typename, typename > + friend class SoAPtr; // fiend other instantiations of this class public: + using value_type = T; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) 
= default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -65,8 +87,8 @@ class SoAPtr RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; } - RAJA_HOST_DEVICE value_type get(size_t i) const { return mem[i]; } - RAJA_HOST_DEVICE void set(size_t i, value_type val) { mem[i] = val; } + RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); } + RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); } private: value_type* mem = nullptr; @@ -75,21 +97,41 @@ class SoAPtr /*! * @brief Specialization for RAJA::reduce::detail::ValueLoc. */ -template -class SoAPtr, mempool> +template +class SoAPtr, mempool, accessor> { - using value_type = RAJA::reduce::detail::ValueLoc; using first_type = T; using second_type = IndexType; + template < typename, typename, typename > + friend class SoAPtr; // fiend other instantiations of this class + public: + using value_type = RAJA::reduce::detail::ValueLoc; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) = default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)), mem_idx(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + , mem_idx(rhs.mem_idx) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -110,12 +152,12 @@ class SoAPtr, mempool> RAJA_HOST_DEVICE value_type get(size_t i) const { - return value_type(mem[i], mem_idx[i]); + return value_type(accessor::get(mem, i), accessor::get(mem_idx, i)); } RAJA_HOST_DEVICE void set(size_t i, value_type val) { - mem[i] = val; - mem_idx[i] = val.getLoc(); + accessor::set(mem, i, first_type(val)); + accessor::set(mem_idx, i, val.getLoc()); } private: diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 03cd3b3deb..f19d9947b6 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -30,6 +30,9 @@ #include "camp/helpers.hpp" +#include "RAJA/util/macros.hpp" + + namespace RAJA { @@ -863,6 +866,29 @@ using const_UnalignedReal_ptr = ConstRestrictRealPtr; #endif + +namespace detail { + +/*! + * \brief Abstracts access to memory using normal memory accesses. + */ +struct DefaultAccessor +{ + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i) + { + return ptr[i]; + } + + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val) + { + ptr[i] = val; + } +}; + +} // namespace detail + } // namespace RAJA #endif // closing endif for header file include guard From 03734897f8de7d8fd60ecda9a51181bf66235e93 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 00:05:45 -0700 Subject: [PATCH 064/108] Add more cuda/hip reducer tunings Add option to initalize reducers with atomics on the host. Add option to to use algorithm that avoids device scope fences. Split cuda/hip reduce header into reduce and intrinsics headers. 
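As an illustration of how these tunings surface at the call site, the sketch below selects one of the policy aliases introduced in this patch from ordinary RAJA user code. It is an example under assumptions rather than part of the patch: the function, pointer, and size names are invented, and any of the new aliases (cuda_reduce_with_fences, cuda_reduce_avoid_fences, cuda_reduce_atomic_host_init, ...) could be substituted for the one shown.

    #include "RAJA/RAJA.hpp"

    // Sum a device array with a tuned CUDA reducer policy (illustrative only).
    double sum_on_device(const double* d_x, int n)
    {
      // Swap this alias to compare tunings, e.g. cuda_reduce_avoid_fences.
      using reduce_pol = RAJA::cuda_reduce_atomic_host_init;
      using exec_pol   = RAJA::cuda_exec<256>;

      RAJA::ReduceSum<reduce_pol, double> total(0.0);

      RAJA::forall<exec_pol>(RAJA::RangeSegment(0, n),
        [=] RAJA_DEVICE (int i) {
          total += d_x[i];
        });

      return total.get();  // combines the per-replication tally slots on the host
    }

Because replication and atomic_stride default to named_usage::unspecified in cuda_reduce_base, the aliases can be used as-is and the reducer falls back to its built-in per-type defaults, as the diff below shows.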
--- docs/Licenses/rocprim-license.txt | 21 + include/RAJA/policy/cuda/intrinsics.hpp | 452 +++++++++++++++ include/RAJA/policy/cuda/policy.hpp | 44 +- include/RAJA/policy/cuda/reduce.hpp | 736 +++++++++--------------- include/RAJA/policy/hip/intrinsics.hpp | 346 +++++++++++ include/RAJA/policy/hip/policy.hpp | 49 +- include/RAJA/policy/hip/reduce.hpp | 589 ++++++++----------- include/RAJA/util/macros.hpp | 2 + include/RAJA/util/types.hpp | 69 +++ test/include/RAJA_test-reducepol.hpp | 12 +- 10 files changed, 1487 insertions(+), 833 deletions(-) create mode 100644 docs/Licenses/rocprim-license.txt create mode 100644 include/RAJA/policy/cuda/intrinsics.hpp create mode 100644 include/RAJA/policy/hip/intrinsics.hpp diff --git a/docs/Licenses/rocprim-license.txt b/docs/Licenses/rocprim-license.txt new file mode 100644 index 0000000000..976ca2abb3 --- /dev/null +++ b/docs/Licenses/rocprim-license.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp new file mode 100644 index 0000000000..053d7ab50e --- /dev/null +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -0,0 +1,452 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for CUDA execution. + * + * These methods should work on any platform that supports + * CUDA devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_cuda_intrinsics_HPP +#define RAJA_cuda_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/cuda/policy.hpp" + + +namespace RAJA +{ + +namespace cuda +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory using normal memory accesses. 
+ */ +struct AccessorWithFences : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory using atomic memory accesses. + * + * \Note Memory access through this class does not guarantee safe access to a + * value that is accessed concurrently by other threads as it may split + * memory operations into multiple atomic instructions. + * \Note Fences used through this class only guarantee ordering, they do not + * guarantee visiblity of non-atomic memory operations as it may not + * actually flush the cache. + * + ****************************************************************************** + */ +struct AccessorAvoidingFences +{ + // cuda has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { + atomicExch(&ptr[i], u.array[i]); + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + + +// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 +constexpr size_t max_shfl_int_type_size = sizeof(unsigned long long); +#else +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); +#endif + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); +#else + u.array[i] = ::__shfl_xor(u.array[i], laneMask); +#endif + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); +#else + u.array[i] = ::__shfl(u.array[i], srcLane); +#endif + } + return u.get_value(); +} + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +#endif + + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE 
RAJA_INLINE double shfl_sync(double var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +#endif + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = __shfl_xor_sync(0xffffffff, temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::cuda::WARP_SIZE; + int warpNum = threadId / policy::cuda::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::cuda::WARP_SIZE) { + + static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + // Need to separate declaration and initialization for clang-cuda + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + + // Partial placement new: Should call new(tmpsd) here but recasting memory + // to avoid calling constructor/destructor in shared memory. 
+ RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::cuda::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace cuda + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_CUDA guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index c9efc45566..b3b8ae04d1 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -159,6 +159,17 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; +template < size_t t_replication, size_t t_atomic_stride, + bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > +struct ReduceTuning +{ + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool maybe_atomic = t_maybe_atomic; + static constexpr bool avoid_fences = t_avoid_fences; + static constexpr bool init_on_host = t_init_on_host; +}; + } // namespace cuda namespace policy @@ -238,9 +249,8 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct cuda_reduce_base +template < typename tuning > +struct cuda_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using cuda_reduce = cuda_reduce_base; +template < bool maybe_atomic, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified, + bool init_on_host = false, + bool avoid_fences = false > +using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< + replication, atomic_stride, + maybe_atomic, init_on_host, avoid_fences> >; + +using cuda_reduce_with_fences = cuda_reduce_base; + +using cuda_reduce_avoid_fences = cuda_reduce_base; + +using cuda_reduce_atomic_with_fences = cuda_reduce_base; + +using cuda_reduce_atomic_avoid_fences = cuda_reduce_base; + +using cuda_reduce_atomic_host_init = cuda_reduce_base; + +using cuda_reduce = cuda_reduce_with_fences; -using cuda_reduce_atomic = cuda_reduce_base; +using cuda_reduce_atomic = cuda_reduce_atomic_host_init; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -1142,6 +1171,11 @@ using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; // policies usable with reducers +using policy::cuda::cuda_reduce_with_fences; +using policy::cuda::cuda_reduce_avoid_fences; +using policy::cuda::cuda_reduce_atomic_with_fences; +using policy::cuda::cuda_reduce_atomic_avoid_fences; +using policy::cuda::cuda_reduce_atomic_host_init; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 3704eb4303..ccb310d2f9 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -25,6 +25,8 @@ #if defined(RAJA_ENABLE_CUDA) +#include + #include #include "RAJA/util/macros.hpp" @@ -38,6 +40,7 @@ #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/cuda/MemUtils_CUDA.hpp" 
+#include "RAJA/policy/cuda/intrinsics.hpp" #if defined(RAJA_ENABLE_DESUL_ATOMICS) #include "RAJA/policy/desul/atomic.hpp" @@ -56,6 +59,7 @@ namespace reduce namespace cuda { + //! atomic operator version of Combiner object template struct atomic; @@ -84,6 +88,22 @@ struct atomic> { } }; +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); + } +}; + template struct cuda_atomic_available { static constexpr const bool value = @@ -101,387 +121,18 @@ namespace cuda namespace impl { -/*! - * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. - */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - static constexpr size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 -constexpr const size_t max_shfl_int_type_size = sizeof(long long); -#else -constexpr const size_t max_shfl_int_type_size = sizeof(int); -#endif - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. 
- * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); -#else - u.array[i] = ::__shfl_xor(u.array[i], laneMask); -#endif - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); -#else - u.array[i] = ::__shfl(u.array[i], srcLane); -#endif - } - return u.value; -} - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -#endif - - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_sync(double var, int srcLane) -{ - return 
::__shfl_sync(0xffffffffu, var, srcLane); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -#endif - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = __shfl_xor_sync(0xffffffff, temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::cuda::WARP_SIZE; - int warpNum = threadId / policy::cuda::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::cuda::WARP_SIZE) { - - static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - - // Partial placement new: Should call new(tmpsd) here but recasting memory - // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); - - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } - - __syncthreads(); - - if (warpNum == 0) { - - // read per warp values - if (warpId * policy::cuda::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } - - for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - } - - __syncthreads(); - } - - return temp; -} - - //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T identity, - TempIterator device_mem, + TempIterator in_device_mem, unsigned int* device_count) { + typename TempIterator::template rebind_accessor device_mem(in_device_mem); + int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; @@ -515,7 +166,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (threadId == 0) { device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots-1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); isLastBlock = (old_count == (numSlots-1)); @@ -527,6 +178,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, // last block accumulates values from device_mem if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); for (unsigned int i = threadId; i < numSlots; @@ -653,6 +305,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -672,7 +325,8 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template +template RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, T identity, T* device_mem, @@ -703,8 +357,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, if (threadId == 0) { unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[atomicOffset] = identity; // consider making this atomic - __threadfence(); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); ::atomicAdd(&device_count[atomicOffset], 1u); } } @@ -717,34 +371,58 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, // wait for device_mem to be initialized while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); + Accessor::fence_acquire(); RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots+1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { - val = device_mem[atomicOffset]; // consider making this atomic + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } return isLastBlock ? replicationId : replication; } +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); + } +} + } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T values[replication]; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -819,7 +497,7 @@ class PinnedTally return ret; } - auto operator*() -> T(&)[replication] { return m_n->values; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -856,7 +534,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - auto new_value(::RAJA::resources::Cuda res) -> T(&)[replication] + auto new_value(::RAJA::resources::Cuda res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -873,7 +551,7 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; return n->values; @@ -896,7 +574,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - cuda::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -923,15 +601,21 @@ class PinnedTally //! Reduction data for Cuda Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +struct Reduce_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; - bool own_device_ptr; + RAJA::detail::SoAPtr device; + bool owns_device_pointer; Reduce_Data() : Reduce_Data(T(), T()){} @@ -945,7 +629,7 @@ struct Reduce_Data { identity{identity_}, device_count{nullptr}, device{}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -955,7 +639,7 @@ struct Reduce_Data { identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -963,9 +647,9 @@ struct Reduce_Data { //! 
initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -977,8 +661,9 @@ struct Reduce_Data { { T temp = value; - size_t replicationId = impl::grid_reduce( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -994,9 +679,9 @@ struct Reduce_Data { size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; size_t maxNumSlots = (numBlocks + replication - 1) / replication; device.allocate(maxNumSlots*replication); - device_count = device_zeroed_mempool_type::getInstance() + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); - own_device_ptr = true; + owns_device_pointer = true; } return act; } @@ -1005,28 +690,114 @@ struct Reduce_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; - //! Reduction data for Cuda Offload -- stores value, host pointer template -struct ReduceAtomic_Data { +struct ReduceAtomicInitialized_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool owns_device_pointer; + + ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){}; + + ReduceAtomicInitialized_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + owns_device_pointer{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + owns_device_pointer{false} + { + } + + ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_initialized( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + owns_device_pointer = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = owns_device_pointer; + if (act) { + is_setup = false; + owns_device_pointer = false; + } + return act; + } +}; + +//! 
Reduction data for Cuda Offload -- stores value, host pointer +template +struct ReduceAtomic_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; T* device; - bool own_device_ptr; + bool owns_device_pointer; ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; @@ -1035,7 +806,7 @@ struct ReduceAtomic_Data { identity{identity_}, device_count{nullptr}, device{nullptr}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -1045,7 +816,7 @@ struct ReduceAtomic_Data { identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -1053,9 +824,9 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -1067,8 +838,9 @@ struct ReduceAtomic_Data { { T temp = value; - size_t replicationId = impl::grid_reduce_atomic( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce_atomic< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -1080,10 +852,10 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); - device_count = device_zeroed_mempool_type::getInstance() + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); - own_device_ptr = true; + owns_device_pointer = true; } return act; } @@ -1092,23 +864,58 @@ struct ReduceAtomic_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; //! Cuda Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 1; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + static constexpr bool use_atomic = tuning::maybe_atomic && + RAJA::reduce::cuda::cuda_atomic_available::value; + + using Accessor = std::conditional_t; + + //! cuda reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t, + cuda::ReduceAtomic_Data>, + cuda::Reduce_Data>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! 
union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -1116,7 +923,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -1189,8 +996,8 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - T(&values)[replication] = *n; - for (size_t r = 0; r < replication; ++r) { + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { Combiner{}(val.value, values[r]); } } @@ -1214,47 +1021,20 @@ class Reduce private: const Reduce* parent; - - static constexpr size_t replication = (t_replication > 0) - ? t_replication - : 1; - static constexpr size_t atomic_stride = (t_atomic_stride > 0) - ? t_atomic_stride - : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) - : 1); - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! cuda reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::cuda::cuda_atomic_available::value, - cuda::ReduceAtomic_Data, - cuda::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace cuda //! specialization of ReduceSum for cuda_reduce -template -class ReduceSum, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceSum, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1266,13 +1046,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for cuda_reduce -template -class ReduceBitOr, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitOr, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1284,13 +1064,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for cuda_reduce -template -class ReduceBitAnd, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitAnd, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1302,13 +1082,13 @@ class ReduceBitAnd, T }; //! 
specialization of ReduceMin for cuda_reduce -template -class ReduceMin, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMin, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1320,13 +1100,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for cuda_reduce -template -class ReduceMax, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMax, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1338,18 +1118,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for cuda_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public cuda::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1388,18 +1168,18 @@ class ReduceMinLoc, T }; //! specialization of ReduceMaxLoc for cuda_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public cuda:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp new file mode 100644 index 0000000000..374a66323e --- /dev/null +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -0,0 +1,346 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for HIP execution. + * + * These methods should work on any platform that supports + * HIP devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_hip_intrinsics_HPP +#define RAJA_hip_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/hip/policy.hpp" + + +namespace RAJA +{ + +namespace hip +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory using normal memory accesses. 
+ */ +struct AccessorWithFences : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory using atomic memory accesses. + * + * \Note Memory access through this class does not guarantee safe access to a + * value that is accessed concurrently by other threads as it may split + * memory operations into multiple atomic instructions. + * \Note Fences used through this class only guarantee ordering, they do not + * guarantee visiblity of non-atomic memory operations as it may not + * actually flush the cache. + * + ****************************************************************************** + */ +struct AccessorAvoidingFences +{ + // hip has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) + u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); +#endif + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) + __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + atomicExch(&ptr[i], u.array[i]); +#endif + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { +#if defined(RAJA_USE_HIP_INTRINSICS) + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); +#else + __threadfence(); +#endif + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { +#if defined(RAJA_USE_HIP_INTRINSICS) + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + // Wait until all vmem operations complete (s_waitcnt vmcnt(0)) + __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8)); +#else + __threadfence(); +#endif + } +}; + + +// hip only has shfl primitives for 32 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl_xor(u.array[i], laneMask); + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl(u.array[i], srcLane); + } + return u.get_value(); +} + + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! 
reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::hip::WARP_SIZE; + int warpNum = threadId / policy::hip::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::hip::WARP_SIZE) { + + static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::hip::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace hip + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_HIP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index c814bec83d..6a53e91177 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -154,6 +154,17 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; +template < size_t t_replication, size_t t_atomic_stride, + bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > +struct ReduceTuning +{ + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool maybe_atomic = t_maybe_atomic; + static constexpr bool avoid_fences = t_avoid_fences; + static constexpr bool init_on_host = t_init_on_host; +}; + } // namespace hip namespace policy @@ -229,9 +240,9 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct hip_reduce_base + +template < typename tuning > +struct hip_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using hip_reduce = hip_reduce_base; +template < bool maybe_atomic, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified, + bool init_on_host = false, + bool avoid_fences = false > +using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< + replication, atomic_stride, + maybe_atomic, init_on_host, avoid_fences> >; + +using hip_reduce_with_fences = hip_reduce_base; + +using hip_reduce_avoid_fences = hip_reduce_base; + +using hip_reduce_atomic_with_fences = hip_reduce_base; + +using hip_reduce_atomic_avoid_fences = hip_reduce_base; + +using 
hip_reduce_atomic_host_init = hip_reduce_base; + +#if defined(RAJA_USE_HIP_INTRINSICS) +using hip_reduce = hip_reduce_avoid_fences; +#else +using hip_reduce = hip_reduce_with_fences; +#endif -using hip_reduce_atomic = hip_reduce_base; +using hip_reduce_atomic = hip_reduce_atomic_host_init; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -1059,6 +1093,11 @@ using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; // policies usable with reducers +using policy::hip::hip_reduce_with_fences; +using policy::hip::hip_reduce_avoid_fences; +using policy::hip::hip_reduce_atomic_with_fences; +using policy::hip::hip_reduce_atomic_avoid_fences; +using policy::hip::hip_reduce_atomic_host_init; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 187de47ee2..6579633957 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -40,6 +40,7 @@ #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/policy/hip/intrinsics.hpp" #include "RAJA/policy/hip/atomic.hpp" #include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" @@ -52,6 +53,7 @@ namespace reduce namespace hip { + //! atomic operator version of Combiner object template struct atomic; @@ -80,6 +82,22 @@ struct atomic> { } }; +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); + } +}; + template struct hip_atomic_available { static constexpr const bool value = @@ -97,264 +115,18 @@ namespace hip namespace impl { -/*! - * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. 
- */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - constexpr static size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// hip only has shfl primitives for 32 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -constexpr const size_t max_shfl_int_type_size = sizeof(int); - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. - * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl_xor(u.array[i], laneMask); - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl(u.array[i], srcLane); - } - return u.value; -} - - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - - -//! 
reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::hip::WARP_SIZE; - int warpNum = threadId / policy::hip::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::hip::WARP_SIZE) { - - static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); - - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } - - __syncthreads(); - - if (warpNum == 0) { - - // read per warp values - if (warpId * policy::hip::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } - - for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - } - - __syncthreads(); - } - - return temp; -} - - //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T identity, - TempIterator device_mem, + TempIterator in_device_mem, unsigned int* device_count) { + typename TempIterator::template rebind_accessor device_mem(in_device_mem); + int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; @@ -388,7 +160,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (threadId == 0) { device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots-1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); isLastBlock = (old_count == (numSlots-1)); @@ -400,6 +172,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, // last block accumulates values from device_mem if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); for (unsigned int i = threadId; i < numSlots; @@ -526,6 +299,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -545,7 +319,9 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template +template RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, T identity, T* device_mem, @@ -576,8 +352,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, if (threadId == 0) { unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[atomicOffset] = identity; // consider making this atomic - __threadfence(); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); ::atomicAdd(&device_count[atomicOffset], 1u); } } @@ -590,34 +366,59 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, // wait for device_mem to be initialized while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); + Accessor::fence_acquire(); RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots+1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { - val = device_mem[atomicOffset]; // consider making this atomic + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } return isLastBlock ? replicationId : replication; } +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); + } + +} + } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T values[replication]; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -692,7 +493,7 @@ class PinnedTally return ret; } - auto operator*() -> T(&)[replication] { return m_n->values; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -729,7 +530,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - auto new_value(::RAJA::resources::Hip res) -> T(&)[replication] + auto new_value(::RAJA::resources::Hip res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -746,7 +547,7 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = hip::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; return n->values; @@ -769,7 +570,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - hip::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -796,14 +597,20 @@ class PinnedTally //! Reduction data for Hip Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +struct Reduce_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; + RAJA::detail::SoAPtr device; bool own_device_ptr; Reduce_Data() : Reduce_Data(T(), T()){}; @@ -836,9 +643,9 @@ struct Reduce_Data { //! 
initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -849,9 +656,9 @@ struct Reduce_Data { void grid_reduce(T* output) { T temp = value; - - size_t replicationId = impl::grid_reduce( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -867,7 +674,7 @@ struct Reduce_Data { size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; size_t maxNumSlots = (numBlocks + replication - 1) / replication; device.allocate(maxNumSlots*replication); - device_count = device_zeroed_mempool_type::getInstance() + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; } @@ -881,7 +688,7 @@ struct Reduce_Data { bool act = own_device_ptr; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -893,7 +700,93 @@ struct Reduce_Data { //! Reduction data for Hip Offload -- stores value, host pointer template -struct ReduceAtomic_Data { +struct ReduceAtomicInitialized_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool own_device_ptr; + + ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){} + + ReduceAtomicInitialized_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + own_device_ptr{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + own_device_ptr{false} + { + } + + ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_initialized( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + own_device_ptr = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = own_device_ptr; + if (act) { + is_setup = false; + own_device_ptr = false; + } + return act; + } +}; + +//! 
Reduction data for Hip Offload -- stores value, host pointer +template +struct ReduceAtomic_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; @@ -926,9 +819,9 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -940,8 +833,9 @@ struct ReduceAtomic_Data { { T temp = value; - size_t replicationId = impl::grid_reduce_atomic( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce_atomic< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -953,8 +847,8 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); - device_count = device_zeroed_mempool_type::getInstance() + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; } @@ -967,9 +861,9 @@ struct ReduceAtomic_Data { { bool act = own_device_ptr; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -977,11 +871,47 @@ struct ReduceAtomic_Data { } }; + //! Hip Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 32; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + static constexpr bool use_atomic = tuning::maybe_atomic && + RAJA::reduce::hip::hip_atomic_available::value; + + using Accessor = std::conditional_t; + + //! hip reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t, + hip::ReduceAtomic_Data>, + hip::Reduce_Data>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! 
union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -989,7 +919,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -1062,8 +992,8 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - T(&values)[replication] = *n; - for (size_t r = 0; r < replication; ++r) { + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { Combiner{}(val.value, values[r]); } } @@ -1087,47 +1017,20 @@ class Reduce private: const Reduce* parent; - - static constexpr size_t replication = (t_replication > 0) - ? t_replication - : 32; - static constexpr size_t atomic_stride = (t_atomic_stride > 0) - ? t_atomic_stride - : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) - : 1); - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! hip reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::hip::hip_atomic_available::value, - hip::ReduceAtomic_Data, - hip::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace hip //! specialization of ReduceSum for hip_reduce -template -class ReduceSum, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceSum, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1139,13 +1042,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for hip_reduce -template -class ReduceBitOr, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitOr, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1157,13 +1060,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for hip_reduce -template -class ReduceBitAnd, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitAnd, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1175,13 +1078,13 @@ class ReduceBitAnd, T> }; //! 
specialization of ReduceMin for hip_reduce -template -class ReduceMin, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMin, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1193,13 +1096,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for hip_reduce -template -class ReduceMax, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMax, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1211,18 +1114,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for hip_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public hip::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1261,18 +1164,18 @@ class ReduceMinLoc, T, }; //! specialization of ReduceMaxLoc for hip_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public hip:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index fc83f8999b..dc3caf86ef 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -56,6 +56,8 @@ #define RAJA_HOST __host__ #define RAJA_SUPPRESS_HD_WARN +#define RAJA_USE_HIP_INTRINSICS + #else #define RAJA_HOST_DEVICE diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index f19d9947b6..8441f75522 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -887,6 +887,75 @@ struct DefaultAccessor } }; + +/*! + * \brief Abstracts T into an equal or greater size array of integers whose + * size is between min_integer_type_size and max_interger_type_size inclusive. 
+ */ +template +struct AsIntegerArray +{ + static_assert(min_integer_type_size <= max_integer_type_size, + "incompatible min and max integer type size"); + using integer_type = typename std::conditional< + ((alignof(T) >= alignof(unsigned long long) && + sizeof(unsigned long long) <= max_integer_type_size) || + sizeof(unsigned long) < min_integer_type_size), + unsigned long long, + typename std::conditional< + ((alignof(T) >= alignof(unsigned long) && + sizeof(unsigned long) <= max_integer_type_size) || + sizeof(unsigned int) < min_integer_type_size), + unsigned long, + typename std::conditional< + ((alignof(T) >= alignof(unsigned int) && + sizeof(unsigned int) <= max_integer_type_size) || + sizeof(unsigned short) < min_integer_type_size), + unsigned int, + typename std::conditional< + ((alignof(T) >= alignof(unsigned short) && + sizeof(unsigned short) <= max_integer_type_size) || + sizeof(unsigned char) < min_integer_type_size), + unsigned short, + typename std::conditional< + ((alignof(T) >= alignof(unsigned char) && + sizeof(unsigned char) <= max_integer_type_size)), + unsigned char, + void>::type>::type>::type>::type>::type; + static_assert(!std::is_same::value, + "could not find a compatible integer type"); + static_assert(sizeof(integer_type) >= min_integer_type_size, + "integer_type smaller than min integer type size"); + static_assert(sizeof(integer_type) <= max_integer_type_size, + "integer_type greater than max integer type size"); + + static constexpr size_t num_integer_type = + (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); + + integer_type array[num_integer_type] = {0}; + + AsIntegerArray() = default; + + RAJA_HOST_DEVICE constexpr size_t array_size() const + { + return num_integer_type; + } + + RAJA_HOST_DEVICE constexpr T get_value() const + { + T value; + memcpy(&value, &array[0], sizeof(T)); + return value; + } + + RAJA_HOST_DEVICE constexpr void set_value(T value) + { + memcpy(&array[0], &value, sizeof(T)); + } +}; + } // namespace detail } // namespace RAJA diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index d8d5fc670b..f6a5306c84 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -34,11 +34,19 @@ using OpenMPTargetReducePols = #endif #if defined(RAJA_ENABLE_CUDA) -using CudaReducePols = camp::list< RAJA::cuda_reduce >; +using CudaReducePols = camp::list< RAJA::cuda_reduce_with_fences, + RAJA::cuda_reduce_avoid_fences, + RAJA::cuda_reduce_atomic_with_fences, + RAJA::cuda_reduce_atomic_avoid_fences, + RAJA::cuda_reduce_atomic_host_init >; #endif #if defined(RAJA_ENABLE_HIP) -using HipReducePols = camp::list< RAJA::hip_reduce >; +using HipReducePols = camp::list< RAJA::hip_reduce_with_fences, + RAJA::hip_reduce_avoid_fences, + RAJA::hip_reduce_atomic_with_fences, + RAJA::hip_reduce_atomic_avoid_fences, + RAJA::hip_reduce_atomic_host_init >; #endif #if defined(RAJA_ENABLE_SYCL) From a6da3e896a487cc115c379db2adf578558d3324c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:03:40 -0700 Subject: [PATCH 065/108] Add reduce atomic host with/avoid fences This allows the choice of which fallback non-atomic policy is used --- include/RAJA/policy/cuda/policy.hpp | 4 +++- include/RAJA/policy/hip/policy.hpp | 4 +++- test/include/RAJA_test-reducepol.hpp | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index b3b8ae04d1..c7815ecfa4 100644 --- 
a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -288,7 +288,9 @@ using cuda_reduce_atomic_with_fences = cuda_reduce_base; -using cuda_reduce_atomic_host_init = cuda_reduce_base; +using cuda_reduce_atomic_host_with_fences = cuda_reduce_base; + +using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; using cuda_reduce = cuda_reduce_with_fences; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 6a53e91177..e89c4e16ad 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -280,7 +280,9 @@ using hip_reduce_atomic_with_fences = hip_reduce_base; -using hip_reduce_atomic_host_init = hip_reduce_base; +using hip_reduce_atomic_host_with_fences = hip_reduce_base; + +using hip_reduce_atomic_host_avoid_fences = hip_reduce_base; #if defined(RAJA_USE_HIP_INTRINSICS) using hip_reduce = hip_reduce_avoid_fences; diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index f6a5306c84..cd97a686ca 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -38,7 +38,8 @@ using CudaReducePols = camp::list< RAJA::cuda_reduce_with_fences, RAJA::cuda_reduce_avoid_fences, RAJA::cuda_reduce_atomic_with_fences, RAJA::cuda_reduce_atomic_avoid_fences, - RAJA::cuda_reduce_atomic_host_init >; + RAJA::cuda_reduce_atomic_host_with_fences, + RAJA::cuda_reduce_atomic_host_avoid_fences >; #endif #if defined(RAJA_ENABLE_HIP) @@ -46,7 +47,8 @@ using HipReducePols = camp::list< RAJA::hip_reduce_with_fences, RAJA::hip_reduce_avoid_fences, RAJA::hip_reduce_atomic_with_fences, RAJA::hip_reduce_atomic_avoid_fences, - RAJA::hip_reduce_atomic_host_init >; + RAJA::hip_reduce_atomic_host_with_fences, + RAJA::hip_reduce_atomic_host_avoid_fences >; #endif #if defined(RAJA_ENABLE_SYCL) From c418614864dde1b045e9042ecaf2ccaa5f02a8fa Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:04:08 -0700 Subject: [PATCH 066/108] change default cuda/hip reduction policies --- include/RAJA/policy/cuda/policy.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index c7815ecfa4..ce3b10b708 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -294,7 +294,7 @@ using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; -#if defined(RAJA_USE_HIP_INTRINSICS) using hip_reduce = hip_reduce_avoid_fences; -#else -using hip_reduce = hip_reduce_with_fences; -#endif -using hip_reduce_atomic = hip_reduce_atomic_host_init; +using hip_reduce_atomic = hip_reduce_atomic_avoid_fences; // Policy for RAJA::statement::Reduce that reduces threads in a block From 18904b6d40dcf92a654852668a78a609a53b8322 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:13:06 -0700 Subject: [PATCH 067/108] fixup atomic host policies --- include/RAJA/policy/cuda/policy.hpp | 3 ++- include/RAJA/policy/hip/policy.hpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ce3b10b708..5b1f3a00fb 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1177,7 +1177,8 @@ using policy::cuda::cuda_reduce_with_fences; using policy::cuda::cuda_reduce_avoid_fences; using policy::cuda::cuda_reduce_atomic_with_fences; using policy::cuda::cuda_reduce_atomic_avoid_fences; 
-using policy::cuda::cuda_reduce_atomic_host_init; +using policy::cuda::cuda_reduce_atomic_host_with_fences; +using policy::cuda::cuda_reduce_atomic_host_avoid_fences; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 379924e71c..c9c42f881a 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1095,7 +1095,8 @@ using policy::hip::hip_reduce_with_fences; using policy::hip::hip_reduce_avoid_fences; using policy::hip::hip_reduce_atomic_with_fences; using policy::hip::hip_reduce_atomic_avoid_fences; -using policy::hip::hip_reduce_atomic_host_init; +using policy::hip::hip_reduce_atomic_host_with_fences; +using policy::hip::hip_reduce_atomic_host_avoid_fences; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; From 4ffd37a154b1e9243a41e8df49f4e99fcb333ca2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:25:32 -0700 Subject: [PATCH 068/108] Fix argument ordering --- include/RAJA/policy/cuda/policy.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 5b1f3a00fb..ea4c5ca1c7 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -278,7 +278,7 @@ template < bool maybe_atomic, bool avoid_fences = false > using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< replication, atomic_stride, - maybe_atomic, init_on_host, avoid_fences> >; + maybe_atomic, avoid_fences, init_on_host> >; using cuda_reduce_with_fences = cuda_reduce_base; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index c9c42f881a..7bcb01c039 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -270,7 +270,7 @@ template < bool maybe_atomic, bool avoid_fences = false > using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< replication, atomic_stride, - maybe_atomic, init_on_host, avoid_fences> >; + maybe_atomic, avoid_fences, init_on_host> >; using hip_reduce_with_fences = hip_reduce_base; From c0595f61b509b4d4acda82e4ec75bf6824e1d427 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 8 Apr 2024 08:02:20 -0700 Subject: [PATCH 069/108] Adjust default policies again --- include/RAJA/policy/cuda/policy.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ea4c5ca1c7..7bd895a2bc 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -294,7 +294,7 @@ using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base Date: Wed, 17 Apr 2024 08:08:17 -0700 Subject: [PATCH 070/108] Add check for specific hip builtins --- include/RAJA/policy/hip/intrinsics.hpp | 9 +++++---- include/RAJA/util/macros.hpp | 7 +++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index 374a66323e..fe3ac0f35d 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -91,7 +91,7 @@ struct AccessorAvoidingFences auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); for (size_t i = 0; i < u.array_size(); ++i) { -#if 
defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load) u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); #else u.array[i] = atomicAdd(&ptr[i], integer_type(0)); @@ -112,7 +112,7 @@ struct AccessorAvoidingFences auto ptr = reinterpret_cast(in_ptr + idx); for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store) __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); #else atomicExch(&ptr[i], u.array[i]); @@ -122,7 +122,7 @@ struct AccessorAvoidingFences static RAJA_DEVICE RAJA_INLINE void fence_acquire() { -#if defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); #else __threadfence(); @@ -131,7 +131,8 @@ struct AccessorAvoidingFences static RAJA_DEVICE RAJA_INLINE void fence_release() { -#if defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \ + RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt) __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); // Wait until all vmem operations complete (s_waitcnt vmcnt(0)) __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8)); diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index dc3caf86ef..55e90010d8 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -66,6 +66,13 @@ #define RAJA_SUPPRESS_HD_WARN #endif + +#if defined(__has_builtin) +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) __has_builtin(x) +#else +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) 0 +#endif + /*! ******************************************************************************* * \def RAJA_USED_ARG(x) From 93ea8878762a501eedc7b56f7d5b95d291641b10 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 18 Apr 2024 14:21:21 -0700 Subject: [PATCH 071/108] Add RAJA::binary_tree_reduce RAJA::binary_tree_reduceThis is a more accurate option when adding many floating point numbers that uses a binary reduction tree pattern. RAJA::accumulate is also added which adds numbers into a single counter. 
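For reference, a minimal host-side usage sketch of the two interfaces added in this patch (the vector, its contents, and the printf call are illustrative assumptions, not part of the patch itself):

    #include "RAJA/RAJA.hpp"
    #include <cstdio>
    #include <vector>

    int main()
    {
      std::vector<double> vals(10000, 0.1);   // hypothetical input data

      // Left-fold accumulation: O(N) operations, O(1) extra memory.
      double folded = RAJA::accumulate(RAJA::make_span(vals.data(), vals.size()));

      // Binary-tree reduction: O(N) operations, O(lg N) extra memory, with less
      // floating point round-off than the left fold when summing many values.
      double treed = RAJA::binary_tree_reduce(
          RAJA::make_span(vals.data(), vals.size()),
          0.0, RAJA::operators::plus<double>{});

      std::printf("accumulate = %.17g, binary_tree_reduce = %.17g\n", folded, treed);
      return 0;
    }

Both functions default the initial value to BinaryOp::identity() and the operator to RAJA::operators::plus, so the explicit arguments on the second call are shown only for illustration.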
--- include/RAJA/RAJA.hpp | 6 + include/RAJA/util/Operators.hpp | 21 +- include/RAJA/util/math.hpp | 75 ++++ include/RAJA/util/reduce.hpp | 400 ++++++++++++++++++ include/RAJA/util/sort.hpp | 21 +- test/unit/algorithm/CMakeLists.txt | 56 ++- .../test-algorithm-util-reduce.cpp.in | 36 ++ .../algorithm/test-algorithm-util-sort.cpp.in | 12 +- .../tests/test-algorithm-reduce-utils.hpp | 350 +++++++++++++++ .../tests/test-algorithm-util-reduce.hpp | 205 +++++++++ 10 files changed, 1135 insertions(+), 47 deletions(-) create mode 100644 include/RAJA/util/math.hpp create mode 100644 include/RAJA/util/reduce.hpp create mode 100644 test/unit/algorithm/test-algorithm-util-reduce.cpp.in create mode 100644 test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp create mode 100644 test/unit/algorithm/tests/test-algorithm-util-reduce.hpp diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 32522a1f0d..5478392ff1 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -33,6 +33,7 @@ #include "RAJA/util/camp_aliases.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" #include "RAJA/util/plugins.hpp" #include "RAJA/util/Registry.hpp" #include "RAJA/util/for_each.hpp" @@ -156,6 +157,11 @@ // #include "RAJA/util/sort.hpp" +// +// reduce algorithms +// +#include "RAJA/util/reduce.hpp" + // // WorkPool, WorkGroup, WorkSite objects // diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp index d76b862c22..b4249e7182 100644 --- a/include/RAJA/util/Operators.hpp +++ b/include/RAJA/util/Operators.hpp @@ -42,9 +42,20 @@ namespace operators namespace detail { +// truly associative (does not include fp add/multiply) struct associative_tag { }; +// associative up to floating point rounding differences +struct fp_associative_tag : associative_tag { +}; + +// get associativity tag appropriate for the type +template < typename T > +using associative_or_fp_associative_tag = + std::conditional_t>::value, + fp_associative_tag, associative_tag>; + template struct binary_function { using first_argument_type = Arg1; @@ -327,7 +338,7 @@ static_assert(check(), template struct plus : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const { @@ -347,7 +358,7 @@ struct minus : public detail::binary_function { template struct multiplies : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const @@ -569,6 +580,12 @@ struct is_associative { std::is_base_of::value; }; +template +struct is_fp_associative { + static constexpr const bool value = + std::is_base_of::value; +}; + template struct safe_plus : public plus +#include + +namespace RAJA +{ + +/*! + \brief evaluate log base 2 of n + + For positive n calculate log base 2 of n, and round the result down to the + nearest integer. + For zero or negative n return 0 + +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE RAJA_INLINE +constexpr T log2(T n) noexcept +{ + T result = 0; + if (n > 0) { + while(n >>= 1) { + ++result; + } + } + return result; +} + +/*! 
+ \brief "round up" to the next greatest power of 2 + + For a integer n, + if n is non-negative, + if n is a power of 2, return n + if n is not a power of 2, return the next greater power of 2 + if n is negative, return 0 +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE +constexpr T next_pow2(T n) noexcept +{ + --n; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + ++n; + return n; +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp new file mode 100644 index 0000000000..6d0c28f861 --- /dev/null +++ b/include/RAJA/util/reduce.hpp @@ -0,0 +1,400 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA sort templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_reduce_HPP +#define RAJA_util_reduce_HPP + +#include "RAJA/config.hpp" + +#include +#include +#include +#include + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/Operators.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + \brief Reduce class that does a reduction with a left fold. +*/ +template +struct LeftFoldReduce +{ + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit LeftFoldReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + , m_accumulated_value(std::move(init)) + { + + } + + LeftFoldReduce(LeftFoldReduce const&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce const&) = delete; + LeftFoldReduce(LeftFoldReduce &&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce &&) = delete; + + ~LeftFoldReduce() = default; + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + m_accumulated_value = BinaryOp::identity(); + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + T accumulated_value = std::move(m_accumulated_value); + + clear(); + + return accumulated_value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + return m_accumulated_value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T val) + { + m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val)); + } + +private: + BinaryOp m_op; + T m_accumulated_value; +}; + +/*! + \brief Reduce class that does a reduction with a binary tree. 
+*/ +template +struct BinaryTreeReduce +{ + static_assert(std::is_unsigned::value, "SizeType must be unsigned"); + static_assert(t_num_levels <= CHAR_BIT*sizeof(SizeType), "SizeType must be large enough to act at a bitset for num_levels"); + + static constexpr SizeType num_levels = t_num_levels; + + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit BinaryTreeReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + { + combine(std::move(init)); + } + + BinaryTreeReduce(BinaryTreeReduce const&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce const&) = delete; + BinaryTreeReduce(BinaryTreeReduce &&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce &&) = delete; + + RAJA_HOST_DEVICE RAJA_INLINE + ~BinaryTreeReduce() + { + clear(); + } + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + // destroy all values on the tree stack and reset count to 0 + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + get_value(level)->~T(); + + m_count ^= mask; + + } + } + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + value = m_op(std::move(value), std::move(*get_value(level))); + get_value(level)->~T(); + + m_count ^= mask; + } + } + + return value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType count = m_count, level = 0, mask = 1; count; ++level, mask <<= 1) { + + if (count & mask) { + + value = m_op(std::move(value), *get_value(level)); + + count ^= mask; + } + } + + return value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T value) + { + // accumulate values and store in the first unused level found + // clear values from used levels along the way + SizeType level = 0; + for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1) { + + value = m_op(std::move(*get_value(level)), std::move(value)); + get_value(level)->~T(); + + } + + new(get_storage(level)) T(std::move(value)); + + ++m_count; + } + +private: + BinaryOp m_op; + + // A counter of the number of inputs combined. + // The bits of count indicate which levels of tree stack have a value + SizeType m_count = 0; + + // Each level in tree stack has a value that holds the accumulation of 2^level + // values or is unused and has no value. + std::aligned_storage_t m_tree_stack[num_levels]; + + RAJA_HOST_DEVICE RAJA_INLINE + void* get_storage(SizeType level) + { + return &m_tree_stack[level]; + } + + RAJA_HOST_DEVICE RAJA_INLINE + T* get_value(SizeType level) + { +#if __cplusplus >= 201703L && !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + // TODO: check that launder is supported in device code + return std::launder(reinterpret_cast(&m_tree_stack[level])); +#else + return reinterpret_cast(&m_tree_stack[level]); +#endif + } +}; + + +template +using HighAccuracyReduce = std::conditional_t< + RAJA::operators::is_fp_associative::value, + BinaryTreeReduce, + LeftFoldReduce>; + + +/*! 
+ \brief Combine into a single value using a left fold with the given + operation using O(N) operations and O(1) memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T left_fold_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + LeftFoldReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +/*! + \brief reduce using a binary tree with the given operation + and using O(N) operations and O(lg(n)) memory + + This is more accurate than sequentially adding into a single value for + floating point types. +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T binary_tree_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + using std::distance; + using SizeType = std::make_unsigned_t; + BinaryTreeReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + + +/*! + \brief reducer that uses a high accuracy implementation when round-off error + is a concern, or a faster algorithm with it is not a concern +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T high_accuracy_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + HighAccuracyReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +} // namespace detail + +/*! + \brief Accumulate given range to a single value + using a left fold algorithm in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/accumulate +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + accumulate(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::left_fold_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! + \brief Reduce given range to a single value + using a binary tree algorithm in O(N) operations and O(lg(N)) extra memory + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + binary_tree_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::binary_tree_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! 
+ \brief Reduce given range to a single value + using an algorithm with high accuracy when floating point round off is a + concern + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + high_accuracy_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::high_accuracy_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/sort.hpp b/include/RAJA/util/sort.hpp index f1eebfc282..bbec03dfe1 100644 --- a/include/RAJA/util/sort.hpp +++ b/include/RAJA/util/sort.hpp @@ -26,8 +26,8 @@ #include "RAJA/pattern/detail/algorithm.hpp" #include "RAJA/util/macros.hpp" - #include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -35,23 +35,6 @@ namespace RAJA namespace detail { -/*! - \brief evaluate log base 2 of N rounded down to the nearest integer >= 0 -*/ -RAJA_HOST_DEVICE RAJA_INLINE -unsigned -ulog2(size_t N) -{ - unsigned val = 0; - - while (N > 1) { - val += 1; - N >>= 1; - } - - return val; -} - /*! \brief unstable partition given range inplace using predicate function and using O(N) predicate evaluations and O(1) memory @@ -426,7 +409,7 @@ intro_sort(Iter begin, auto N = end - begin; // set max depth to 2*lg(N) - unsigned max_depth = 2*detail::ulog2(N); + unsigned max_depth = 2*RAJA::log2(N); #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // limit max_depth statically in device code to allow compiler to remove recursion diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt index 0142a94ed3..ea93727d59 100644 --- a/test/unit/algorithm/CMakeLists.txt +++ b/test/unit/algorithm/CMakeLists.txt @@ -48,46 +48,62 @@ foreach( SORT_BACKEND ${SORT_BACKENDS} ) endforeach() -set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) -set( CUDA_UTIL_SORTS Shell Heap Intro ) -set( HIP_UTIL_SORTS Shell Heap Intro ) -macro(RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS SORT_BACKEND_in SORT_SIZE_in UTIL_SORTS) - set( SORT_BACKEND ${SORT_BACKEND_in} ) - set( SORT_SIZE ${SORT_SIZE_in} ) - foreach( UTIL_SORT ${UTIL_SORTS} ) - configure_file( test-algorithm-util-sort.cpp.in - test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) +macro(RAJA_GENERATE_ALGORITHM_UTIL_TESTS ALG ALG_BACKEND_in ALG_SIZE_in UTIL_ALGS) + set( ALG_BACKEND ${ALG_BACKEND_in} ) + set( ALG_SIZE ${ALG_SIZE_in} ) + foreach( UTIL_ALG ${UTIL_ALGS} ) + configure_file( test-algorithm-util-${ALG}.cpp.in + test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - raja_add_test( NAME test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) + raja_add_test( NAME test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - target_include_directories(test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.exe + target_include_directories(test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - unset( SORT_SIZE ) - unset( SORT_BACKEND ) + unset( ALG_SIZE ) + unset( ALG_BACKEND ) endmacro() -RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) 
-RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Small "Insertion" ) +set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) +set( CUDA_UTIL_SORTS Shell Heap Intro ) +set( HIP_UTIL_SORTS Shell Heap Intro ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Small "Insertion" ) if(RAJA_ENABLE_CUDA) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Small "${CUDA_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Small "${CUDA_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Tiny "Insertion" ) endif() if(RAJA_ENABLE_HIP) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Small "${HIP_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Small "${HIP_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Tiny "Insertion" ) endif() + +set( UTIL_REDUCES BinaryTree Accumulate ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Sequential Default "${UTIL_REDUCES}" ) + +if(RAJA_ENABLE_CUDA) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Cuda Small "${UTIL_REDUCES}" ) +endif() + +if(RAJA_ENABLE_HIP) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Hip Small "${UTIL_REDUCES}" ) +endif() + + unset( SORT_BACKENDS ) unset( SEQUENTIAL_UTIL_SORTS ) unset( CUDA_UTIL_SORTS ) unset( HIP_UTIL_SORTS ) +unset( UTIL_REDUCES ) raja_add_test( diff --git a/test/unit/algorithm/test-algorithm-util-reduce.cpp.in b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in new file mode 100644 index 0000000000..d7dd20bcd2 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in @@ -0,0 +1,36 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. 
+// +#include "test-algorithm-util-reduce.hpp" + + +// +// Cartesian product of types used in parameterized tests +// +using @ALG_BACKEND@@UTIL_ALG@ReduceTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@ReduceReducers, + @ALG_BACKEND@ResourceList, + ReduceValTypeList, + ReduceMaxNList@ALG_SIZE@ > >::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, + ReduceUnitTest, + @ALG_BACKEND@@UTIL_ALG@ReduceTypes ); diff --git a/test/unit/algorithm/test-algorithm-util-sort.cpp.in b/test/unit/algorithm/test-algorithm-util-sort.cpp.in index 7dbb0dcd93..0555a9e9f0 100644 --- a/test/unit/algorithm/test-algorithm-util-sort.cpp.in +++ b/test/unit/algorithm/test-algorithm-util-sort.cpp.in @@ -22,15 +22,15 @@ // // Cartesian product of types used in parameterized tests // -using @SORT_BACKEND@@UTIL_SORT@SortTypes = - Test< camp::cartesian_product<@SORT_BACKEND@@UTIL_SORT@SortSorters, - @SORT_BACKEND@ResourceList, +using @ALG_BACKEND@@UTIL_ALG@SortTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@SortSorters, + @ALG_BACKEND@ResourceList, SortKeyTypeList, - SortMaxNList@SORT_SIZE@ > >::Types; + SortMaxNList@ALG_SIZE@ > >::Types; // // Instantiate parameterized test // -INSTANTIATE_TYPED_TEST_SUITE_P( @SORT_BACKEND@, +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, SortUnitTest, - @SORT_BACKEND@@UTIL_SORT@SortTypes ); + @ALG_BACKEND@@UTIL_ALG@SortTypes ); diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp new file mode 100644 index 0000000000..5277a07684 --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp @@ -0,0 +1,350 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing test infrastructure for reduce tests +/// + +#ifndef __TEST_ALGORITHM_REDUCE_UTILS_HPP__ +#define __TEST_ALGORITHM_REDUCE_UTILS_HPP__ + +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "type_helper.hpp" +#include "RAJA_unit-test-forone.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + + +// tag classes to differentiate reduce by attributes and apply correct testing +struct left_fold_reduce_tag { }; +struct unordered_reduce_tag { }; + +struct reduce_interface_tag { }; + +struct reduce_default_interface_tag { }; +struct reduce_init_interface_tag { }; +struct reduce_init_op_interface_tag { }; + + +// synchronize based on a RAJA execution policy +template < typename policy > +struct PolicySynchronize +{ + void synchronize() + { + // no synchronization needed + } +}; + +#if defined(RAJA_ENABLE_CUDA) +// partial specialization for cuda_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + +#if defined(RAJA_ENABLE_HIP) +// partial specialization for hip_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + + +template +struct ReduceData; + +template +struct ReduceData +{ + ValType* values = nullptr; + ValType* reduced_value = nullptr; + Res m_res; + + template < typename RandomGenerator > + ReduceData(size_t N, Res res, RandomGenerator gen_random) + : m_res(res) + { + if (N > 0) { + values = m_res.template allocate(N, camp::resources::MemoryAccess::Managed); + } + reduced_value = m_res.template allocate(1, camp::resources::MemoryAccess::Managed); + + for (size_t i = 0; i < N; i++) { + values[i] = gen_random(); + } + } + + void copy_data(size_t N) + { + if ( N == 0 ) return; + } + + Res resource() + { + return m_res; + } + + ReduceData(ReduceData const&) = delete; + ReduceData& operator=(ReduceData const&) = delete; + + ~ReduceData() + { + if (values != nullptr) { + m_res.deallocate(values, camp::resources::MemoryAccess::Managed); + m_res.deallocate(reduced_value, camp::resources::MemoryAccess::Managed); + } + } +}; + + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_default_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N)); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_init_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + Reducer reducer, reduce_interface_tag, reduce_init_op_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init, op); + reducer.synchronize(); +} + + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, left_fold_reduce_tag, reduce_interface_tag si, 
BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (left fold reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, unordered_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (unordered reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + + +template +void testReducerInterfaces(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + using reduce_category = typename Reducer::reduce_category ; + using interface_category = typename Reducer::reduce_interface ; + using no_init_operator = reduce_default_interface_tag; + using init_no_operator = reduce_init_interface_tag; + using init_operator = reduce_init_op_interface_tag; + + std::mt19937 rng(seed); + RAJA::Index_type N = std::uniform_int_distribution((MaxN+1)/2, MaxN)(rng); + std::uniform_int_distribution dist(-N, N); + + ReduceData data(N, res, [&](){ return dist(rng); }); + + ASSERT_TRUE(testReduce("default", seed, data, N, RAJA::operators::plus::identity(), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, no_init_operator{})); + ASSERT_TRUE(testReduce("init", seed, data, N, ValType(N), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, init_no_operator{})); + ASSERT_TRUE(testReduce("minimum", seed, data, N, ValType(0), RAJA::operators::minimum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); + ASSERT_TRUE(testReduce("Maximum", seed, data, N, ValType(0), RAJA::operators::maximum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); +} + +template +void testReducer(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + testReducerInterfaces(seed, 0, reducer, res); + for (RAJA::Index_type n = 1; n <= MaxN; n *= 10) { + testReducerInterfaces(seed, n, reducer, res); + } +} + +inline unsigned get_random_seed() +{ + static unsigned seed = std::random_device{}(); + return seed; +} + + +TYPED_TEST_SUITE_P(ReduceUnitTest); + +template < typename T > +class ReduceUnitTest : public ::testing::Test +{ }; + +TYPED_TEST_P(ReduceUnitTest, UnitReduce) +{ + using Reducer = typename camp::at>::type; + using ResType = typename camp::at>::type; + using ValType = typename camp::at>::type; + using MaxNType = typename camp::at>::type; + + unsigned seed = get_random_seed(); + RAJA::Index_type MaxN = MaxNType::value; + Reducer reducer{}; + ResType res = ResType::get_default(); + + 
testReducer(seed, MaxN, reducer, res); +} + +REGISTER_TYPED_TEST_SUITE_P(ReduceUnitTest, UnitReduce); + + +// +// Key types for reduce tests +// +using ReduceValTypeList = + camp::list< + RAJA::Index_type, + int, +#if defined(RAJA_TEST_EXHAUSTIVE) + unsigned, + long long, + unsigned long long, + float, +#endif + double + >; + +// Max test lengths for reduce tests +using ReduceMaxNListDefault = + camp::list< + camp::num<10000> + >; + +using ReduceMaxNListSmall = + camp::list< + camp::num<1000> + >; + +using ReduceMaxNListTiny = + camp::list< + camp::num<100> + >; + +#endif //__TEST_ALGORITHM_REDUCE_UTILS_HPP__ + diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp new file mode 100644 index 0000000000..f2cb0dda8d --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp @@ -0,0 +1,205 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing Reducer classes for util reduce tests +/// + +#ifndef __TEST_ALGORITHM_UTIL_REDUCE_HPP__ +#define __TEST_ALGORITHM_UTIL_REDUCE_HPP__ + +#include "test-algorithm-reduce-utils.hpp" + + +template < typename test_policy > +using ForoneSynchronize = PolicySynchronize>; + + +template < typename test_policy, typename platform = test_platform > +struct BinaryTreeReduce; + +template < typename test_policy, typename platform = test_platform > +struct Accumulate; + + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::binary_tree_reduce"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... args) + { + *reduced_value = RAJA::binary_tree_reduce(std::forward(args)...); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::accumulate"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... 
args) + { + *reduced_value = RAJA::accumulate(std::forward(args)...); + } +}; + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + BinaryTreeReduce() + : m_name(std::string("RAJA::binary_tree_reduce<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init, op); + }); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + Accumulate() + : m_name(std::string("RAJA::accumulate<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init, op); + }); + } +}; + +#endif + + +using SequentialBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using SequentialAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#if defined(RAJA_ENABLE_CUDA) + +using CudaBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using CudaAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#if defined(RAJA_ENABLE_HIP) + +using HipBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using HipAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#endif //__TEST_ALGORITHM_UTIL_REDUCE_HPP__ + From b8cfadfaf3b22effb3cfda89c698d6d2be5912bc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 18 Apr 2024 16:19:36 -0700 Subject: [PATCH 072/108] Use the higher accuracy reducer in cuda and hip reducers --- include/RAJA/policy/cuda/reduce.hpp | 6 +++++- include/RAJA/policy/hip/reduce.hpp | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index ccb310d2f9..698a259375 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -35,6 +35,7 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" 
+#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" @@ -995,12 +996,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { T(&values)[tally_slots] = *n; for (size_t r = 0; r < tally_slots; ++r) { - Combiner{}(val.value, values[r]); + reducer.combine(std::move(values[r])); } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 6579633957..9b8c625c22 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -35,6 +35,7 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" @@ -991,12 +992,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { T(&values)[tally_slots] = *n; for (size_t r = 0; r < tally_slots; ++r) { - Combiner{}(val.value, values[r]); + reducer.combine(std::move(values[r])); } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; From aae46e9fb3a22aa61a9dbe7d5d0ff6bfcf715bc9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 10:31:48 -0700 Subject: [PATCH 073/108] Add some documentation for new cuda/hip reduction policies --- docs/sphinx/user_guide/feature/policies.rst | 56 +++++++++++++-------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index aad065cb16..9222af59c4 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -743,26 +743,42 @@ It is important to note the following constraints about RAJA reduction usage: The following table summarizes RAJA reduction policy types: -======================= ============= ========================================== -Reduction Policy Loop Policies Brief description - to Use With -======================= ============= ========================================== -seq_reduce seq_exec, Non-parallel (sequential) reduction. -omp_reduce any OpenMP OpenMP parallel reduction. - policy -omp_reduce_ordered any OpenMP OpenMP parallel reduction with result - policy guaranteed to be reproducible. -omp_target_reduce any OpenMP OpenMP parallel target offload reduction. - target policy -cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel - policy (device synchronization will occur when - reduction value is finalized). -cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use CUDA - policy atomic operations. -sycl_reduce any SYCL Reduction in a SYCL kernel (device - policy synchronization will occur when the - reduction value is finalized). 
-======================= ============= ========================================== +======================================== ============= ========================================== +Reduction Policy Loop Policies Brief description + to Use With +======================================== ============= ========================================== +seq_reduce seq_exec, Non-parallel (sequential) reduction. +omp_reduce any OpenMP OpenMP parallel reduction. + policy +omp_reduce_ordered any OpenMP OpenMP parallel reduction with result + policy guaranteed to be reproducible. +omp_target_reduce any OpenMP OpenMP parallel target offload reduction. + target policy +cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel + policy (device synchronization will occur when + reduction value is finalized). +cuda/hip_reduce\*atomic\* any CUDA/HIP Same as above, but reduction may use + policy atomic operations and initializes the + memory used for atomics on the device. + This works on all architectures but + incurs higher overheads. +cuda/hip_reduce\*atomic_host\* any CUDA/HIP Same as above, but reduction may use + policy atomic operations and initializes the + memory used for atomics on the host. + This works on recent architectures and + incurs lower overheads. +cuda/hip_reduce\*with_fences any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses with device scope fences. + This works on all architectures but + incurs higher overheads. +cuda/hip_reduce\*avoid_fences any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to allow it to avoid + device scope fences. This improves + performance on some architectures. +sycl_reduce any SYCL Reduction in a SYCL kernel (device + policy synchronization will occur when the + reduction value is finalized). +======================================== ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not guaranteed to generate correct results. So they should not be used From 0f2790e94edb3c6a287008241f716e6231bf1b47 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 10:57:01 -0700 Subject: [PATCH 074/108] improve code docs of reduce policies --- include/RAJA/policy/cuda/policy.hpp | 31 ++++++++++++++++++++++++----- include/RAJA/policy/hip/policy.hpp | 31 ++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 7bd895a2bc..c86822763b 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -280,20 +280,41 @@ using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< replication, atomic_stride, maybe_atomic, avoid_fences, init_on_host> >; +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. 
On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *with_fences policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *avoid_fences policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. using cuda_reduce_with_fences = cuda_reduce_base; - +/// using cuda_reduce_avoid_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_with_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_avoid_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_host_with_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way using cuda_reduce = cuda_reduce_with_fences; +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way using cuda_reduce_atomic = cuda_reduce_atomic_host_with_fences; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index f59bbac891..b63a8690ad 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -272,20 +272,41 @@ using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< replication, atomic_stride, maybe_atomic, avoid_fences, init_on_host> >; +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *with_fences policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *avoid_fences policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. 
using hip_reduce_with_fences = hip_reduce_base; - +/// using hip_reduce_avoid_fences = hip_reduce_base; - +/// using hip_reduce_atomic_with_fences = hip_reduce_base; - +/// using hip_reduce_atomic_avoid_fences = hip_reduce_base; - +/// using hip_reduce_atomic_host_with_fences = hip_reduce_base; - +/// using hip_reduce_atomic_host_avoid_fences = hip_reduce_base; +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way using hip_reduce = hip_reduce_avoid_fences; +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way using hip_reduce_atomic = hip_reduce_atomic_host_avoid_fences; From e9ef4c42007f21f5177f317ca14752b58642925e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 17:29:13 -0700 Subject: [PATCH 075/108] Use enums in cuda/hip reduce tuning policies --- include/RAJA/policy/cuda/policy.hpp | 71 +++++++++++++++++++++-------- include/RAJA/policy/cuda/reduce.hpp | 30 ++++++++---- include/RAJA/policy/hip/policy.hpp | 71 +++++++++++++++++++++-------- include/RAJA/policy/hip/reduce.hpp | 29 ++++++++---- 4 files changed, 145 insertions(+), 56 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index c86822763b..3b534348cf 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -159,15 +159,28 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; -template < size_t t_replication, size_t t_atomic_stride, - bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > + +enum struct reduce_algorithm : int +{ + finalize_last_block, + init_first_block_finalize_block_atomic, + init_host_finalize_block_atomic +}; + +enum struct block_communication_mode : int +{ + device_fence, + avoid_device_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > struct ReduceTuning { + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; - static constexpr bool maybe_atomic = t_maybe_atomic; - static constexpr bool avoid_fences = t_avoid_fences; - static constexpr bool init_on_host = t_init_on_host; }; } // namespace cuda @@ -271,14 +284,13 @@ struct cuda_atomic_explicit{}; */ using cuda_atomic = cuda_atomic_explicit; -template < bool maybe_atomic, + +template < RAJA::cuda::reduce_algorithm algorithm, + RAJA::cuda::block_communication_mode comm_mode, size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified, - bool init_on_host = false, - bool avoid_fences = false > -using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< - replication, atomic_stride, - maybe_atomic, avoid_fences, init_on_host> >; + size_t atomic_stride = named_usage::unspecified > +using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; // Policies for RAJA::Reduce* objects with specific behaviors. // - *atomic* policies may use atomics to combine partial results and falls back @@ -297,17 +309,35 @@ using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< // in a cache shared by the whole device to avoid having to use // device scope fences. 
This improves performance on some HW but // is more difficult to code correctly. -using cuda_reduce_with_fences = cuda_reduce_base; +using cuda_reduce_with_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::finalize_last_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_avoid_fences = cuda_reduce_base; +using cuda_reduce_avoid_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::finalize_last_block, + RAJA::cuda::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_with_fences = cuda_reduce_base; +using cuda_reduce_atomic_with_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_avoid_fences = cuda_reduce_base; +using cuda_reduce_atomic_avoid_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::cuda::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_with_fences = cuda_reduce_base; +using cuda_reduce_atomic_host_with_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; +using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::cuda::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way @@ -317,6 +347,11 @@ using cuda_reduce = cuda_reduce_with_fences; // same answer every time when used in the same way using cuda_reduce_atomic = cuda_reduce_atomic_host_with_fences; +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool maybe_atomic > +using cuda_reduce_base = std::conditional_t; + // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 698a259375..59ea86308f 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -877,6 +877,7 @@ struct ReduceAtomic_Data } }; + //! Cuda Reduction entity -- generalize on reduction, and type template class Reduce @@ -890,19 +891,28 @@ class Reduce ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - static constexpr bool use_atomic = tuning::maybe_atomic && - RAJA::reduce::cuda::cuda_atomic_available::value; - - using Accessor = std::conditional_t; + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorWithFences, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || + (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; //! 
cuda reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t, - cuda::ReduceAtomic_Data>, - cuda::Reduce_Data>; + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + (atomic_policy && !atomic_available), + cuda::Reduce_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), + cuda::ReduceAtomicInitialized_Data, + void>>, + void>>; static constexpr size_t tally_slots = reduce_data_type::tally_slots; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index b63a8690ad..1501a6dc35 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -154,15 +154,28 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; -template < size_t t_replication, size_t t_atomic_stride, - bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > + +enum struct reduce_algorithm : int +{ + finalize_last_block, + init_first_block_finalize_block_atomic, + init_host_finalize_block_atomic +}; + +enum struct block_communication_mode : int +{ + device_fence, + avoid_device_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > struct ReduceTuning { + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; - static constexpr bool maybe_atomic = t_maybe_atomic; - static constexpr bool avoid_fences = t_avoid_fences; - static constexpr bool init_on_host = t_init_on_host; }; } // namespace hip @@ -263,14 +276,13 @@ struct hip_atomic_explicit{}; */ using hip_atomic = hip_atomic_explicit; -template < bool maybe_atomic, + +template < RAJA::hip::reduce_algorithm algorithm, + RAJA::hip::block_communication_mode comm_mode, size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified, - bool init_on_host = false, - bool avoid_fences = false > -using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< - replication, atomic_stride, - maybe_atomic, avoid_fences, init_on_host> >; + size_t atomic_stride = named_usage::unspecified > +using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; // Policies for RAJA::Reduce* objects with specific behaviors. // - *atomic* policies may use atomics to combine partial results and falls back @@ -289,17 +301,35 @@ using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< // in a cache shared by the whole device to avoid having to use // device scope fences. This improves performance on some HW but // is more difficult to code correctly. 
-using hip_reduce_with_fences = hip_reduce_base; +using hip_reduce_with_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::finalize_last_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_avoid_fences = hip_reduce_base; +using hip_reduce_avoid_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::finalize_last_block, + RAJA::hip::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_with_fences = hip_reduce_base; +using hip_reduce_atomic_with_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_avoid_fences = hip_reduce_base; +using hip_reduce_atomic_avoid_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::hip::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_with_fences = hip_reduce_base; +using hip_reduce_atomic_host_with_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_avoid_fences = hip_reduce_base; +using hip_reduce_atomic_host_avoid_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::hip::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way @@ -309,6 +339,11 @@ using hip_reduce = hip_reduce_avoid_fences; // same answer every time when used in the same way using hip_reduce_atomic = hip_reduce_atomic_host_avoid_fences; +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool maybe_atomic > +using hip_reduce_base = std::conditional_t; + // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 9b8c625c22..c8793d5102 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -886,19 +886,28 @@ class Reduce ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - static constexpr bool use_atomic = tuning::maybe_atomic && - RAJA::reduce::hip::hip_atomic_available::value; - - using Accessor = std::conditional_t; + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorWithFences, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || + (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; //! 
hip reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t, - hip::ReduceAtomic_Data>, - hip::Reduce_Data>; + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + (atomic_policy && !atomic_available), + hip::Reduce_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), + hip::ReduceAtomicInitialized_Data, + void>>, + void>>; static constexpr size_t tally_slots = reduce_data_type::tally_slots; From c946dd332a967c014884a0597d1c906e423b5319 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 20:35:58 -0700 Subject: [PATCH 076/108] update cuda/hip simple allocator comments --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 8 ++++---- include/RAJA/policy/hip/MemUtils_HIP.hpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 5a66aff20e..43d927acab 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -61,7 +61,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFreeHost(ptr)); @@ -80,7 +80,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -103,7 +103,7 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -127,7 +127,7 @@ struct DevicePinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 63a8c9911c..84c6d1fa38 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -63,7 +63,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipHostFree(ptr)); @@ -82,7 +82,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -105,7 +105,7 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -124,7 +124,7 @@ struct DevicePinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); From 8fbbf48dc3a38466759c318ed5c396db052dc9e2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 21:22:16 -0700 Subject: [PATCH 077/108] Rename Accessor types --- include/RAJA/policy/cuda/intrinsics.hpp | 35 ++++++++++++++++++------- include/RAJA/policy/cuda/reduce.hpp | 4 +-- 
include/RAJA/policy/hip/intrinsics.hpp | 35 ++++++++++++++++++------- include/RAJA/policy/hip/reduce.hpp | 4 +-- 4 files changed, 54 insertions(+), 24 deletions(-) diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp index 053d7ab50e..c908046cac 100644 --- a/include/RAJA/policy/cuda/intrinsics.hpp +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -46,9 +46,18 @@ namespace impl { /*! - * \brief Abstracts access to memory using normal memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. */ -struct AccessorWithFences : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -64,18 +73,24 @@ struct AccessorWithFences : RAJA::detail::DefaultAccessor /*! ****************************************************************************** * - * \brief Abstracts access to memory using atomic memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. * - * \Note Memory access through this class does not guarantee safe access to a - * value that is accessed concurrently by other threads as it may split - * memory operations into multiple atomic instructions. - * \Note Fences used through this class only guarantee ordering, they do not - * guarantee visiblity of non-atomic memory operations as it may not - * actually flush the cache. + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. + * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. + * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. 
* ****************************************************************************** */ -struct AccessorAvoidingFences +struct AccessorDeviceScopeUseSharedCache { // cuda has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 59ea86308f..37e266b94a 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -892,9 +892,9 @@ class Reduce : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorAvoidingFences, + impl::AccessorDeviceScopeUseSharedCache, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorWithFences, + impl::AccessorDeviceScopeUseLocalCache, void>>; static constexpr bool atomic_policy = diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index fe3ac0f35d..c36f609f90 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -46,9 +46,18 @@ namespace impl { /*! - * \brief Abstracts access to memory using normal memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. */ -struct AccessorWithFences : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -64,18 +73,24 @@ struct AccessorWithFences : RAJA::detail::DefaultAccessor /*! ****************************************************************************** * - * \brief Abstracts access to memory using atomic memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. * - * \Note Memory access through this class does not guarantee safe access to a - * value that is accessed concurrently by other threads as it may split - * memory operations into multiple atomic instructions. - * \Note Fences used through this class only guarantee ordering, they do not - * guarantee visiblity of non-atomic memory operations as it may not - * actually flush the cache. + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. + * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. + * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. 
* ****************************************************************************** */ -struct AccessorAvoidingFences +struct AccessorDeviceScopeUseSharedCache { // hip has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index c8793d5102..140f01eabf 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -887,9 +887,9 @@ class Reduce : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorAvoidingFences, + impl::AccessorDeviceScopeUseSharedCache, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorWithFences, + impl::AccessorDeviceScopeUseLocalCache, void>>; static constexpr bool atomic_policy = From 7f519cbab869e3902b591bcb23714698b92c8094 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 29 Apr 2024 09:44:58 -0700 Subject: [PATCH 078/108] Increase lassen ci time --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index e6da7cecbf..cee458cd60 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -56,7 +56,7 @@ variables: # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_JOB_ALLOC: "1 -W 30 -q pci" + LASSEN_JOB_ALLOC: "1 -W 40 -q pci" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70" # Project specific deps for lassen From 2f9ce110a7b5ae141586139da2b11cca94bca2b2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 29 Apr 2024 13:24:49 -0700 Subject: [PATCH 079/108] Don't specify T in atomic reduce helpers --- include/RAJA/policy/cuda/reduce.hpp | 10 +++++----- include/RAJA/policy/hip/reduce.hpp | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 37e266b94a..ccdfe43e63 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -69,7 +69,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); } }; @@ -77,7 +77,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); } }; @@ -85,7 +85,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); } }; @@ -93,7 +93,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); } }; @@ -101,7 +101,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); } }; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 140f01eabf..2258340b52 100644 --- a/include/RAJA/policy/hip/reduce.hpp 
+++ b/include/RAJA/policy/hip/reduce.hpp @@ -63,7 +63,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); } }; @@ -71,7 +71,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); } }; @@ -79,7 +79,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); } }; @@ -87,7 +87,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); + RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); } }; @@ -95,7 +95,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); + RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); } }; From 780ecdc2f03a2d02e74a5fd8f21388fa542a64a1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 29 Apr 2024 16:46:23 -0700 Subject: [PATCH 080/108] Apply suggestions from code review Co-authored-by: Robert Chen --- include/RAJA/util/SoAPtr.hpp | 2 +- test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp | 2 +- test/unit/algorithm/tests/test-algorithm-util-reduce.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp index 00a2fce111..47802d8f0a 100644 --- a/include/RAJA/util/SoAPtr.hpp +++ b/include/RAJA/util/SoAPtr.hpp @@ -46,7 +46,7 @@ template - friend class SoAPtr; // fiend other instantiations of this class + friend class SoAPtr; // friend other instantiations of this class public: using value_type = T; diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp index 5277a07684..4e3f9fb795 100644 --- a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp +++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. // // Produced at the Lawrence Livermore National Laboratory // diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp index f2cb0dda8d..062e0f9b91 100644 --- a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp +++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. 
// // Produced at the Lawrence Livermore National Laboratory // From 2bb0b8f5b17f3f7137679b4343256bc1cd5d395e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 30 Apr 2024 17:36:35 -0700 Subject: [PATCH 081/108] Rename rec_for_reduce polices to reduce_default --- include/RAJA/policy/cuda/policy.hpp | 18 +++++++++--------- include/RAJA/policy/hip/policy.hpp | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 3b534348cf..fb1a2f90ae 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1062,7 +1062,7 @@ using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConc using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; -using CudaRecForReduceConcretizer = CudaMaxOccupancyConcretizer; +using CudaReduceDefaultConcretizer = CudaMaxOccupancyConcretizer; using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer; @@ -1189,24 +1189,24 @@ using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, BLOCKS_PER_SM, Async>; + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_explicit_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, BLOCKS_PER_SM, true>; + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; // policies usable with WorkGroup diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 1501a6dc35..d1985ce667 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1058,7 +1058,7 @@ using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcre using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; -using HipRecForReduceConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; +using HipReduceDefaultConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; @@ -1125,14 +1125,14 @@ using hip_exec_occ_custom_async = policy::hip::hip_exec< Concretizer, true>; template -using hip_exec_rec_for_reduce = policy::hip::hip_exec< +using hip_exec_reduce_default = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipRecForReduceConcretizer, Async>; + HipReduceDefaultConcretizer, Async>; template -using hip_exec_rec_for_reduce_async = policy::hip::hip_exec< +using 
hip_exec_reduce_default_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipRecForReduceConcretizer, true>; + HipReduceDefaultConcretizer, true>; // policies usable with WorkGroup using policy::hip::hip_work; From ebaeaf49f5eae03054222a553081252eaa1bef64 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 30 Apr 2024 17:40:37 -0700 Subject: [PATCH 082/108] Rename reducer policies --- include/RAJA/policy/cuda/intrinsics.hpp | 4 +- include/RAJA/policy/cuda/policy.hpp | 58 +++++++++++----------- include/RAJA/policy/cuda/reduce.hpp | 64 ++++++++++++------------- include/RAJA/policy/hip/intrinsics.hpp | 4 +- include/RAJA/policy/hip/policy.hpp | 58 +++++++++++----------- include/RAJA/policy/hip/reduce.hpp | 64 ++++++++++++------------- test/include/RAJA_test-reducepol.hpp | 24 +++++----- 7 files changed, 138 insertions(+), 138 deletions(-) diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp index c908046cac..b0d2ea7cf1 100644 --- a/include/RAJA/policy/cuda/intrinsics.hpp +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -57,7 +57,7 @@ namespace impl * so device scope fences are required to make memory accesses visible * to the whole device. */ -struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -90,7 +90,7 @@ struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor * ****************************************************************************** */ -struct AccessorDeviceScopeUseSharedCache +struct AccessorDeviceScopeUseBlockFence { // cuda has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index fb1a2f90ae..a2aff97373 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -162,15 +162,15 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer enum struct reduce_algorithm : int { - finalize_last_block, - init_first_block_finalize_block_atomic, - init_host_finalize_block_atomic + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block }; enum struct block_communication_mode : int { device_fence, - avoid_device_fence + block_fence }; template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, @@ -303,49 +303,49 @@ using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< // the memory used with atomics is initialized on the host which is // significantly cheaper on some HW. On some HW this is faster overall than // the non-atomic and atomic policies. -// - *with_fences policies use normal memory accesses with device scope fences +// - *device_fence policies use normal memory accesses with device scope fences // in the implementation. This works on all HW. -// - *avoid_fences policies use special (atomic) memory accesses that only cache +// - *block_fence policies use special (atomic) memory accesses that only cache // in a cache shared by the whole device to avoid having to use // device scope fences. This improves performance on some HW but // is more difficult to code correctly. 
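In user code the renamed policies drop into the usual reducer pattern unchanged; only the policy spelling differs. A minimal sketch, assuming the standard RAJA forall/ReduceSum API and CUDA compilation:

    #include <RAJA/RAJA.hpp>

    // Sum the entries of x on the device. cuda_reduce is the deterministic
    // default; cuda_reduce_atomic (or one of the explicitly named variants
    // below) may trade run-to-run reproducibility for speed.
    void sum_example(const double* x, int N, double& out)
    {
      RAJA::ReduceSum<RAJA::cuda_reduce, double> sum(0.0);
      RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, N),
          [=] RAJA_DEVICE (int i) { sum += x[i]; });
      out = sum.get();
    }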
-using cuda_reduce_with_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::finalize_last_block, +using cuda_reduce_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, RAJA::cuda::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_avoid_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::finalize_last_block, - RAJA::cuda::block_communication_mode::avoid_device_fence, +using cuda_reduce_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, + RAJA::cuda::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_with_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, +using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, RAJA::cuda::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_avoid_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, - RAJA::cuda::block_communication_mode::avoid_device_fence, +using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_with_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, +using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, RAJA::cuda::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, - RAJA::cuda::block_communication_mode::avoid_device_fence, +using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way -using cuda_reduce = cuda_reduce_with_fences; +using cuda_reduce = cuda_reduce_device_fence; // Policy for RAJA::Reduce* objects that may use atomics and may not give the // same answer every time when used in the same way -using cuda_reduce_atomic = cuda_reduce_atomic_host_with_fences; +using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool @@ -1229,12 +1229,12 @@ using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; // policies usable with reducers -using policy::cuda::cuda_reduce_with_fences; -using policy::cuda::cuda_reduce_avoid_fences; -using policy::cuda::cuda_reduce_atomic_with_fences; -using policy::cuda::cuda_reduce_atomic_avoid_fences; -using policy::cuda::cuda_reduce_atomic_host_with_fences; -using policy::cuda::cuda_reduce_atomic_host_avoid_fences; +using policy::cuda::cuda_reduce_device_fence; +using policy::cuda::cuda_reduce_block_fence; +using policy::cuda::cuda_reduce_atomic_device_init_device_fence; +using 
policy::cuda::cuda_reduce_atomic_device_init_block_fence; +using policy::cuda::cuda_reduce_atomic_host_init_device_fence; +using policy::cuda::cuda_reduce_atomic_host_init_block_fence; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index ccdfe43e63..516b02383c 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -127,7 +127,7 @@ namespace impl template -RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, T identity, TempIterator in_device_mem, unsigned int* device_count) @@ -328,7 +328,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // returns true if put reduced value in val template -RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, T identity, T* device_mem, unsigned int* device_count) @@ -391,7 +391,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, //! reduce values in block into thread 0 and atomically combines into device_mem template -RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, T identity, T* device_mem) { @@ -604,7 +604,7 @@ class PinnedTally //! pointer template -struct Reduce_Data +struct ReduceLastBlock_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -618,14 +618,14 @@ struct Reduce_Data RAJA::detail::SoAPtr device; bool owns_device_pointer; - Reduce_Data() : Reduce_Data(T(), T()){} + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){} /*! \brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -635,7 +635,7 @@ struct Reduce_Data } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -644,7 +644,7 @@ struct Reduce_Data { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -662,7 +662,7 @@ struct Reduce_Data { T temp = value; - size_t replicationId = impl::grid_reduce< + size_t replicationId = impl::grid_reduce_last_block< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -705,7 +705,7 @@ struct Reduce_Data //! 
Reduction data for Cuda Offload -- stores value, host pointer template -struct ReduceAtomicInitialized_Data +struct ReduceAtomicHostInit_Data { using tally_mempool_type = device_pinned_mempool_type; @@ -716,9 +716,9 @@ struct ReduceAtomicInitialized_Data bool is_setup; bool owns_device_pointer; - ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){}; + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){}; - ReduceAtomicInitialized_Data(T initValue, T identity_) + ReduceAtomicHostInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, is_setup{false}, @@ -727,7 +727,7 @@ struct ReduceAtomicInitialized_Data } RAJA_HOST_DEVICE - ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) : value{other.identity}, identity{other.identity}, is_setup{other.is_setup}, @@ -735,7 +735,7 @@ struct ReduceAtomicInitialized_Data { } - ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -753,7 +753,7 @@ struct ReduceAtomicInitialized_Data { T temp = value; - impl::grid_reduce_atomic_initialized( temp, identity, output); } @@ -786,7 +786,7 @@ struct ReduceAtomicInitialized_Data //! Reduction data for Cuda Offload -- stores value, host pointer template -struct ReduceAtomic_Data +struct ReduceAtomicDeviceInit_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -800,9 +800,9 @@ struct ReduceAtomic_Data T* device; bool owns_device_pointer; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){}; - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -812,7 +812,7 @@ struct ReduceAtomic_Data } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -821,7 +821,7 @@ struct ReduceAtomic_Data { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -839,7 +839,7 @@ struct ReduceAtomic_Data { T temp = value; - size_t replicationId = impl::grid_reduce_atomic< + size_t replicationId = impl::grid_reduce_atomic_device_init< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -891,26 +891,26 @@ class Reduce ? 
RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorDeviceScopeUseSharedCache, + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorDeviceScopeUseLocalCache, + impl::AccessorDeviceScopeUseDeviceFence, void>>; static constexpr bool atomic_policy = - (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || - (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; //! cuda reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || (atomic_policy && !atomic_available), - cuda::Reduce_Data, + cuda::ReduceLastBlock_Data, std::conditional_t, - std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), - cuda::ReduceAtomicInitialized_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block), + cuda::ReduceAtomicDeviceInit_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + cuda::ReduceAtomicHostInit_Data, void>>, void>>; diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index c36f609f90..354e5d7278 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -57,7 +57,7 @@ namespace impl * so device scope fences are required to make memory accesses visible * to the whole device. 
*/ -struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -90,7 +90,7 @@ struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor * ****************************************************************************** */ -struct AccessorDeviceScopeUseSharedCache +struct AccessorDeviceScopeUseBlockFence { // hip has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index d1985ce667..df0995f59c 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -157,15 +157,15 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer enum struct reduce_algorithm : int { - finalize_last_block, - init_first_block_finalize_block_atomic, - init_host_finalize_block_atomic + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block }; enum struct block_communication_mode : int { device_fence, - avoid_device_fence + block_fence }; template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, @@ -295,49 +295,49 @@ using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< // the memory used with atomics is initialized on the host which is // significantly cheaper on some HW. On some HW this is faster overall than // the non-atomic and atomic policies. -// - *with_fences policies use normal memory accesses with device scope fences +// - *device_fence policies use normal memory accesses with device scope fences // in the implementation. This works on all HW. -// - *avoid_fences policies use special (atomic) memory accesses that only cache +// - *block_fence policies use special (atomic) memory accesses that only cache // in a cache shared by the whole device to avoid having to use // device scope fences. This improves performance on some HW but // is more difficult to code correctly. 
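The boolean selector keeps working after the renaming; a small compile-time sketch (the equivalences follow from the alias definitions in this file):

    #include <type_traits>
    #include <RAJA/RAJA.hpp>

    // hip_reduce_base<maybe_atomic> simply forwards to one of the two defaults.
    static_assert(std::is_same<RAJA::hip_reduce_base<false>, RAJA::hip_reduce>::value,
                  "false selects the deterministic default");
    static_assert(std::is_same<RAJA::hip_reduce_base<true>, RAJA::hip_reduce_atomic>::value,
                  "true selects the atomic default");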
-using hip_reduce_with_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::finalize_last_block, +using hip_reduce_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, RAJA::hip::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_avoid_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::finalize_last_block, - RAJA::hip::block_communication_mode::avoid_device_fence, +using hip_reduce_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, + RAJA::hip::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_with_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, +using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, RAJA::hip::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_avoid_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, - RAJA::hip::block_communication_mode::avoid_device_fence, +using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_with_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, +using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, RAJA::hip::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_avoid_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, - RAJA::hip::block_communication_mode::avoid_device_fence, +using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way -using hip_reduce = hip_reduce_avoid_fences; +using hip_reduce = hip_reduce_block_fence; // Policy for RAJA::Reduce* objects that may use atomics and may not give the // same answer every time when used in the same way -using hip_reduce_atomic = hip_reduce_atomic_host_avoid_fences; +using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool @@ -1147,12 +1147,12 @@ using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; // policies usable with reducers -using policy::hip::hip_reduce_with_fences; -using policy::hip::hip_reduce_avoid_fences; -using policy::hip::hip_reduce_atomic_with_fences; -using policy::hip::hip_reduce_atomic_avoid_fences; -using policy::hip::hip_reduce_atomic_host_with_fences; -using policy::hip::hip_reduce_atomic_host_avoid_fences; +using policy::hip::hip_reduce_device_fence; +using policy::hip::hip_reduce_block_fence; +using policy::hip::hip_reduce_atomic_device_init_device_fence; +using policy::hip::hip_reduce_atomic_device_init_block_fence; +using 
policy::hip::hip_reduce_atomic_host_init_device_fence; +using policy::hip::hip_reduce_atomic_host_init_block_fence; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 2258340b52..2dbaf9f7e5 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -121,7 +121,7 @@ namespace impl template -RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, T identity, TempIterator in_device_mem, unsigned int* device_count) @@ -323,7 +323,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red template -RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, T identity, T* device_mem, unsigned int* device_count) @@ -386,7 +386,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, //! reduce values in block into thread 0 and atomically combines into device_mem template -RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, T identity, T* device_mem) { @@ -600,7 +600,7 @@ class PinnedTally //! pointer template -struct Reduce_Data +struct ReduceLastBlock_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -614,14 +614,14 @@ struct Reduce_Data RAJA::detail::SoAPtr device; bool own_device_ptr; - Reduce_Data() : Reduce_Data(T(), T()){}; + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){}; /*! \brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -631,7 +631,7 @@ struct Reduce_Data } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -640,7 +640,7 @@ struct Reduce_Data { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -657,7 +657,7 @@ struct Reduce_Data void grid_reduce(T* output) { T temp = value; - size_t replicationId = impl::grid_reduce< + size_t replicationId = impl::grid_reduce_last_block< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -701,7 +701,7 @@ struct Reduce_Data //! 
Reduction data for Hip Offload -- stores value, host pointer template -struct ReduceAtomicInitialized_Data +struct ReduceAtomicHostInit_Data { using tally_mempool_type = device_pinned_mempool_type; @@ -712,9 +712,9 @@ struct ReduceAtomicInitialized_Data bool is_setup; bool own_device_ptr; - ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){} + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){} - ReduceAtomicInitialized_Data(T initValue, T identity_) + ReduceAtomicHostInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, is_setup{false}, @@ -723,7 +723,7 @@ struct ReduceAtomicInitialized_Data } RAJA_HOST_DEVICE - ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) : value{other.identity}, identity{other.identity}, is_setup{other.is_setup}, @@ -731,7 +731,7 @@ struct ReduceAtomicInitialized_Data { } - ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -749,7 +749,7 @@ struct ReduceAtomicInitialized_Data { T temp = value; - impl::grid_reduce_atomic_initialized( + impl::grid_reduce_atomic_host_init( temp, identity, output); } @@ -781,7 +781,7 @@ struct ReduceAtomicInitialized_Data //! Reduction data for Hip Offload -- stores value, host pointer template -struct ReduceAtomic_Data +struct ReduceAtomicDeviceInit_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -795,9 +795,9 @@ struct ReduceAtomic_Data T* device; bool own_device_ptr; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){} + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){} - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -807,7 +807,7 @@ struct ReduceAtomic_Data } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -816,7 +816,7 @@ struct ReduceAtomic_Data { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -834,7 +834,7 @@ struct ReduceAtomic_Data { T temp = value; - size_t replicationId = impl::grid_reduce_atomic< + size_t replicationId = impl::grid_reduce_atomic_device_init< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -886,26 +886,26 @@ class Reduce ? 
RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorDeviceScopeUseSharedCache, + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorDeviceScopeUseLocalCache, + impl::AccessorDeviceScopeUseDeviceFence, void>>; static constexpr bool atomic_policy = - (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || - (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; //! hip reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || (atomic_policy && !atomic_available), - hip::Reduce_Data, + hip::ReduceLastBlock_Data, std::conditional_t, - std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), - hip::ReduceAtomicInitialized_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block), + hip::ReduceAtomicDeviceInit_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + hip::ReduceAtomicHostInit_Data, void>>, void>>; diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index cd97a686ca..e9e075b287 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -34,21 +34,21 @@ using OpenMPTargetReducePols = #endif #if defined(RAJA_ENABLE_CUDA) -using CudaReducePols = camp::list< RAJA::cuda_reduce_with_fences, - RAJA::cuda_reduce_avoid_fences, - RAJA::cuda_reduce_atomic_with_fences, - RAJA::cuda_reduce_atomic_avoid_fences, - RAJA::cuda_reduce_atomic_host_with_fences, - RAJA::cuda_reduce_atomic_host_avoid_fences >; +using CudaReducePols = camp::list< RAJA::cuda_reduce_device_fence, + RAJA::cuda_reduce_block_fence, + RAJA::cuda_reduce_atomic_device_init_device_fence, + RAJA::cuda_reduce_atomic_device_init_block_fence, + RAJA::cuda_reduce_atomic_host_init_device_fence, + RAJA::cuda_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_HIP) -using HipReducePols = camp::list< RAJA::hip_reduce_with_fences, - RAJA::hip_reduce_avoid_fences, - RAJA::hip_reduce_atomic_with_fences, - RAJA::hip_reduce_atomic_avoid_fences, - RAJA::hip_reduce_atomic_host_with_fences, - RAJA::hip_reduce_atomic_host_avoid_fences >; +using HipReducePols = camp::list< RAJA::hip_reduce_device_fence, + RAJA::hip_reduce_block_fence, + RAJA::hip_reduce_atomic_device_init_device_fence, + RAJA::hip_reduce_atomic_device_init_block_fence, + RAJA::hip_reduce_atomic_host_init_device_fence, + RAJA::hip_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_SYCL) From 70523d2725a3ad77dfc32e4a128cdd249a7ada5e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 30 Apr 2024 17:51:50 -0700 Subject: [PATCH 083/108] Update docs for renamings --- .../sphinx/user_guide/cook_book/reduction.rst | 6 
+-- docs/sphinx/user_guide/feature/policies.rst | 44 ++++++++++--------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index e8925ee019..64fb172df7 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -52,15 +52,15 @@ RAJA uses policy types to specify how things are implemented. The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The -``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a CUDA GPU kernel with +``RAJA::cuda_exec_reduce_default<256>`` runs the loop as a CUDA GPU kernel with 256 threads per block and other CUDA kernel launch parameters, like the number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_rec_for_reduce<256>; - // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>; + // using exec_policy = RAJA::cuda_exec_reduce_default<256>; + // using exec_policy = RAJA::hip_exec_reduce_default<256>; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 9222af59c4..2b6c3574b8 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -280,15 +280,15 @@ policies have the prefix ``hip_``. Concretizer> policy but the grid size is determined by the concretizer. - cuda/hip_exec_rec_for_reduce forall The cuda/hip exec policy - that is recommended for - use with reducers. In general using - the occupancy calculator policies - are better but exactly how much - occupancy to use differs by platform - so this policy provides a simple way - to get what works best for that platform - without having to know the details. + cuda/hip_exec_reduce_default forall The cuda/hip exec policy that is + recommended for use with reducers. + In general using the occupancy + calculator policies are better for + reducers but exactly how much occupancy + to use differs by platform so this policy + provides a simple way to get what works + best for a platform without having to + know the details. cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed @@ -758,22 +758,26 @@ cuda/hip_reduce any CUDA/HIP Parallel reduction in a C policy (device synchronization will occur when reduction value is finalized). cuda/hip_reduce\*atomic\* any CUDA/HIP Same as above, but reduction may use - policy atomic operations and initializes the - memory used for atomics on the device. - This works on all architectures but - incurs higher overheads. -cuda/hip_reduce\*atomic_host\* any CUDA/HIP Same as above, but reduction may use - policy atomic operations and initializes the + policy atomic operations leading to run to run + variability in the results. +cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the memory used for atomics on the host. This works on recent architectures and incurs lower overheads. 
-cuda/hip_reduce\*with_fences any CUDA/HIP Same as above, and reduction uses normal - policy memory accesses with device scope fences. +cuda/hip_reduce\*device_init\* any CUDA/HIP Same as above, but initializes the + memory used for atomics on the device. This works on all architectures but incurs higher overheads. -cuda/hip_reduce\*avoid_fences any CUDA/HIP Same as above, and reduction uses special - policy memory accesses to allow it to avoid - device scope fences. This improves +cuda/hip_reduce\*device_fence any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses that are not visible across + the whole device and device scope fences + to ensure visibility and ordering. + This works on all architectures but + incurs higher overheads on some architectures. +cuda/hip_reduce\*block_fence any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to a level of cache shared + visible to the whole device and block scope + fences to ensure ordering. This improves performance on some architectures. sycl_reduce any SYCL Reduction in a SYCL kernel (device policy synchronization will occur when the From 1b79d608a0f58580b155da2136e174d528f916bb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 15:07:45 -0700 Subject: [PATCH 084/108] Use conditional_t in AsIntegerArray --- include/RAJA/util/types.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 8441f75522..7e331ef00e 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -899,31 +899,31 @@ struct AsIntegerArray { static_assert(min_integer_type_size <= max_integer_type_size, "incompatible min and max integer type size"); - using integer_type = typename std::conditional< + using integer_type = std::conditional_t< ((alignof(T) >= alignof(unsigned long long) && sizeof(unsigned long long) <= max_integer_type_size) || sizeof(unsigned long) < min_integer_type_size), unsigned long long, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned long) && sizeof(unsigned long) <= max_integer_type_size) || sizeof(unsigned int) < min_integer_type_size), unsigned long, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned int) && sizeof(unsigned int) <= max_integer_type_size) || sizeof(unsigned short) < min_integer_type_size), unsigned int, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned short) && sizeof(unsigned short) <= max_integer_type_size) || sizeof(unsigned char) < min_integer_type_size), unsigned short, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned char) && sizeof(unsigned char) <= max_integer_type_size)), unsigned char, - void>::type>::type>::type>::type>::type; + void>>>>>; static_assert(!std::is_same::value, "could not find a compatible integer type"); static_assert(sizeof(integer_type) >= min_integer_type_size, From f6e62bc800fa8d137b04e7b2dbad7e14dc41ed74 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 15:32:48 -0700 Subject: [PATCH 085/108] Rename to exec_with_reduce --- docs/sphinx/user_guide/cook_book/reduction.rst | 6 +++--- docs/sphinx/user_guide/feature/policies.rst | 2 +- include/RAJA/policy/cuda/policy.hpp | 8 ++++---- include/RAJA/policy/hip/policy.hpp | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst 
index 64fb172df7..a750ee149c 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -52,15 +52,15 @@ RAJA uses policy types to specify how things are implemented. The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The -``RAJA::cuda_exec_reduce_default<256>`` runs the loop as a CUDA GPU kernel with +``RAJA::cuda_exec_with_reduce<256>`` runs the loop as a CUDA GPU kernel with 256 threads per block and other CUDA kernel launch parameters, like the number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_reduce_default<256>; - // using exec_policy = RAJA::hip_exec_reduce_default<256>; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; + // using exec_policy = RAJA::hip_exec_with_reduce<256>; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 2b6c3574b8..11d5aa5f05 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -280,7 +280,7 @@ policies have the prefix ``hip_``. Concretizer> policy but the grid size is determined by the concretizer. - cuda/hip_exec_reduce_default forall The cuda/hip exec policy that is + cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is recommended for use with reducers. 
In general using the occupancy calculator policies are better for diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index a2aff97373..ed6456e0fc 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1189,22 +1189,22 @@ using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_reduce_default_explicit = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_reduce_default_explicit_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_reduce_default = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_reduce_default_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index df0995f59c..3712eccbb9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1125,12 +1125,12 @@ using hip_exec_occ_custom_async = policy::hip::hip_exec< Concretizer, true>; template -using hip_exec_reduce_default = policy::hip::hip_exec< +using hip_exec_with_reduce = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipReduceDefaultConcretizer, Async>; template -using hip_exec_reduce_default_async = policy::hip::hip_exec< +using hip_exec_with_reduce_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipReduceDefaultConcretizer, true>; From 3143dda1509b184fdb4ae2baf15952d0022b2034 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 16:47:32 -0700 Subject: [PATCH 086/108] Add cuda/hip_exec_base This lets you choose between cuda/hip_exec and cuda/hip_exec_with_reduce similarly to how cuda/hip_reduce_base lets you choose betwen cuda/hip_reduce and cuda/hip_reduce_atomic --- .../sphinx/user_guide/cook_book/reduction.rst | 8 +++--- docs/sphinx/user_guide/feature/policies.rst | 26 ++++++++++++------- include/RAJA/policy/cuda/policy.hpp | 20 ++++++++++++++ include/RAJA/policy/hip/policy.hpp | 10 +++++++ 4 files changed, 50 insertions(+), 14 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index a750ee149c..b025f8a549 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -59,8 +59,8 @@ number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_with_reduce<256>; - // using exec_policy = RAJA::hip_exec_with_reduce<256>; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; // or RAJA::cuda_exec_base; + // using exec_policy = 
RAJA::hip_exec_with_reduce<256>; // or RAJA::hip_exec_base; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the @@ -72,8 +72,8 @@ data type, and can only be used with cuda execution policies. Similarly for othe using reduce_policy = RAJA::seq_reduce; // using reduce_policy = RAJA::omp_reduce; // using reduce_policy = RAJA::omp_target_reduce; - // using reduce_policy = RAJA::cuda_reduce_atomic; - // using reduce_policy = RAJA::hip_reduce_atomic; + // using reduce_policy = RAJA::cuda_reduce_atomic; // or RAJA::cuda_reduce_base + // using reduce_policy = RAJA::hip_reduce_atomic; // or RAJA::hip_reduce_base // using reduce_policy = RAJA::sycl_reduce; diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 11d5aa5f05..71291073ce 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -247,6 +247,18 @@ policies have the prefix ``hip_``. Note that the thread-block size must be provided, there is no default. + cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is + recommended for use with reducers. + In general using the occupancy + calculator policies are better for + reducers but exactly how much occupancy + to use differs by platform so this policy + provides a simple way to get what works + best for a platform without having to + know the details. + cuda/hip_exec_base cuda/hip_exec_with_reduce policies based on + the with_reduce boolean. cuda/hip_exec_grid forall, Execute loop iterations mapped to global threads via grid striding with multiple @@ -280,15 +292,6 @@ policies have the prefix ``hip_``. Concretizer> policy but the grid size is determined by the concretizer. - cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is - recommended for use with reducers. - In general using the occupancy - calculator policies are better for - reducers but exactly how much occupancy - to use differs by platform so this policy - provides a simple way to get what works - best for a platform without having to - know the details. cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed @@ -757,9 +760,12 @@ omp_target_reduce any OpenMP OpenMP parallel target of cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel policy (device synchronization will occur when reduction value is finalized). -cuda/hip_reduce\*atomic\* any CUDA/HIP Same as above, but reduction may use +cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use policy atomic operations leading to run to run variability in the results. +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and + policy cuda/hip_reduce_atomic policies based on + the maybe_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the memory used for atomics on the host. 
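The ``cuda/hip_exec_base`` and ``cuda/hip_reduce_base`` rows above are thin ``std::conditional_t`` switches over a boolean. A short sketch of what the documented selection is expected to resolve to, assuming the aliases are wired up as the policy.hpp hunks in this patch series describe (requires a CUDA-enabled RAJA build; the block size 256 is arbitrary)::

    #include <type_traits>
    #include "RAJA/RAJA.hpp"

    // true  -> the reducer-friendly execution policy, false -> the general one.
    static_assert(std::is_same<RAJA::cuda_exec_base<true, 256>,
                               RAJA::cuda_exec_with_reduce<256>>::value, "");
    static_assert(std::is_same<RAJA::cuda_exec_base<false, 256>,
                               RAJA::cuda_exec<256>>::value, "");

    // true  -> reduction may use atomics (run-to-run variability possible),
    // false -> the deterministic non-atomic reduction.
    static_assert(std::is_same<RAJA::cuda_reduce_base<true>,
                               RAJA::cuda_reduce_atomic>::value, "");
    static_assert(std::is_same<RAJA::cuda_reduce_base<false>,
                               RAJA::cuda_reduce>::value, "");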
This works on recent architectures and diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ed6456e0fc..d99a8c6c79 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1208,6 +1208,26 @@ using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; +template +using cuda_exec_base_explicit = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base_explicit_async = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base_async = std::conditional_t, + cuda_exec>; + // policies usable with WorkGroup template diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 3712eccbb9..5c9841aa8c 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1134,6 +1134,16 @@ using hip_exec_with_reduce_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipReduceDefaultConcretizer, true>; +template +using hip_exec_base = std::conditional_t, + hip_exec>; + +template +using hip_exec_base_async = std::conditional_t, + hip_exec>; + // policies usable with WorkGroup using policy::hip::hip_work; From 7df3f554287f3417eb030337b65bcd66430c9577 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 17:03:09 -0700 Subject: [PATCH 087/108] Add extra bit to cookbook for base policies --- .../sphinx/user_guide/cook_book/reduction.rst | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index b025f8a549..d1190e222a 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -59,8 +59,8 @@ number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_with_reduce<256>; // or RAJA::cuda_exec_base; - // using exec_policy = RAJA::hip_exec_with_reduce<256>; // or RAJA::hip_exec_base; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; + // using exec_policy = RAJA::hip_exec_with_reduce<256>; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the @@ -72,8 +72,8 @@ data type, and can only be used with cuda execution policies. 
Similarly for othe using reduce_policy = RAJA::seq_reduce; // using reduce_policy = RAJA::omp_reduce; // using reduce_policy = RAJA::omp_target_reduce; - // using reduce_policy = RAJA::cuda_reduce_atomic; // or RAJA::cuda_reduce_base - // using reduce_policy = RAJA::hip_reduce_atomic; // or RAJA::hip_reduce_base + // using reduce_policy = RAJA::cuda_reduce_atomic; + // using reduce_policy = RAJA::hip_reduce_atomic; // using reduce_policy = RAJA::sycl_reduce; @@ -91,3 +91,20 @@ Here a simple sum reduction is performed using RAJA:: The results of these operations will yield the following values: * vsum.get() == 1000 + + +Another option for the execution policy when using the cuda or hip backends are +the base policies which have a boolean parameter to choose between the general +use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy.:: + + // static constexpr bool with_reducers = ...; + // using exec_policy = RAJA::cuda_exec_base; + // using exec_policy = RAJA::hip_exec_base; + +Another option for the reduction policy when using the cuda or hip backends are +the base policies which have a boolean parameter to choose between the atomic +``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce`` policy.:: + + // static constexpr bool maybe_atomic = ...; + // using reduce_policy = RAJA::cuda_reduce_base; + // using reduce_policy = RAJA::hip_reduce_base; From a7d0b1bfe69a8620096ad21b63061bfc2a548ea8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 10:15:40 -0700 Subject: [PATCH 088/108] Use with_atomic and with_reduce more consistently --- docs/sphinx/user_guide/cook_book/reduction.rst | 12 ++++++------ docs/sphinx/user_guide/feature/policies.rst | 4 ++-- include/RAJA/policy/cuda/policy.hpp | 4 ++-- include/RAJA/policy/hip/policy.hpp | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index d1190e222a..3ee2b479f2 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -97,14 +97,14 @@ Another option for the execution policy when using the cuda or hip backends are the base policies which have a boolean parameter to choose between the general use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy.:: - // static constexpr bool with_reducers = ...; - // using exec_policy = RAJA::cuda_exec_base; - // using exec_policy = RAJA::hip_exec_base; + // static constexpr bool with_reduce = ...; + // using exec_policy = RAJA::cuda_exec_base; + // using exec_policy = RAJA::hip_exec_base; Another option for the reduction policy when using the cuda or hip backends are the base policies which have a boolean parameter to choose between the atomic ``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce`` policy.:: - // static constexpr bool maybe_atomic = ...; - // using reduce_policy = RAJA::cuda_reduce_base; - // using reduce_policy = RAJA::hip_reduce_base; + // static constexpr bool with_atomic = ...; + // using reduce_policy = RAJA::cuda_reduce_base; + // using reduce_policy = RAJA::hip_reduce_base; diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 71291073ce..ec35367cee 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -763,9 +763,9 @@ cuda/hip_reduce any CUDA/HIP Parallel reduction in a C cuda/hip_reduce_atomic any CUDA/HIP Same as 
above, but reduction may use policy atomic operations leading to run to run variability in the results. -cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and policy cuda/hip_reduce_atomic policies based on - the maybe_atomic boolean. + the with_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the memory used for atomics on the host. This works on recent architectures and diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index d99a8c6c79..ae510715ff 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -349,8 +349,8 @@ using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool -template < bool maybe_atomic > -using cuda_reduce_base = std::conditional_t; +template < bool with_atomic > +using cuda_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 5c9841aa8c..7c965a3c54 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -341,8 +341,8 @@ using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool -template < bool maybe_atomic > -using hip_reduce_base = std::conditional_t; +template < bool with_atomic > +using hip_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block From 55ea1d353a6a4bd6840052ceb450303cf03bea82 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 10:20:13 -0700 Subject: [PATCH 089/108] Improve formatting of final values in reduce cookbook --- docs/sphinx/user_guide/cook_book/reduction.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index 3ee2b479f2..73843ebb40 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -46,7 +46,7 @@ Here a simple sum reduction is performed in a for loop:: The results of these operations will yield the following values: - * vsum == 1000 + * ``vsum == 1000`` RAJA uses policy types to specify how things are implemented. 
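The cookbook hunks above show the policy aliases, but the kernel itself sits outside the diff context. A minimal sketch of the sum reduction they refer to, assuming (as the surrounding text implies) 1000 device-accessible elements that each contribute 1; the helper name ``device_sum`` is illustrative::

    #include "RAJA/RAJA.hpp"

    // Sum N device-accessible integers; mirrors the cookbook's vsum example.
    int device_sum(const int* a, int N)
    {
      using exec_policy   = RAJA::cuda_exec_with_reduce<256>;
      using reduce_policy = RAJA::cuda_reduce_atomic;

      RAJA::ReduceSum<reduce_policy, int> vsum(0);

      RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
          [=] RAJA_DEVICE (int i) { vsum += a[i]; });

      return vsum.get();  // 1000 when N == 1000 and every a[i] == 1
    }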
@@ -90,7 +90,7 @@ Here a simple sum reduction is performed using RAJA:: The results of these operations will yield the following values: - * vsum.get() == 1000 + * ``vsum.get() == 1000`` Another option for the execution policy when using the cuda or hip backends are From 9793ed9fb3809740c4e76bf647eb87c87b98d73b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 10:30:08 -0700 Subject: [PATCH 090/108] fix spacing --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index ec35367cee..85d3ef475d 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -763,7 +763,7 @@ cuda/hip_reduce any CUDA/HIP Parallel reduction in a C cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use policy atomic operations leading to run to run variability in the results. -cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and policy cuda/hip_reduce_atomic policies based on the with_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the From f46adf9700d8f03b794471664cfd376ca8426e32 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 12:55:54 -0700 Subject: [PATCH 091/108] fix base exec policies --- include/RAJA/policy/cuda/policy.hpp | 12 ++++++------ include/RAJA/policy/hip/policy.hpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ae510715ff..84cd8a301c 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1210,13 +1210,13 @@ using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< template using cuda_exec_base_explicit = std::conditional_t, - cuda_exec>; + cuda_exec_with_reduce_explicit, + cuda_exec_explicit>; template using cuda_exec_base_explicit_async = std::conditional_t, - cuda_exec>; + cuda_exec_with_reduce_explicit_async, + cuda_exec_explicit_async>; template using cuda_exec_base = std::conditional_t using cuda_exec_base_async = std::conditional_t, - cuda_exec>; + cuda_exec_with_reduce_async, + cuda_exec_async>; // policies usable with WorkGroup diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 7c965a3c54..c359a68de0 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1141,8 +1141,8 @@ using hip_exec_base = std::conditional_t using hip_exec_base_async = std::conditional_t, - hip_exec>; + hip_exec_with_reduce_async, + hip_exec_async>; // policies usable with WorkGroup using policy::hip::hip_work; From f987c3948f7951606ee400a068062b4be59740b1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 12:58:56 -0700 Subject: [PATCH 092/108] Try to fix tables in docs --- docs/sphinx/user_guide/feature/policies.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 85d3ef475d..d418163c11 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -257,7 +257,7 @@ policies have the prefix ``hip_``. best for a platform without having to know the details. 
cuda/hip_exec_base cuda/hip_exec_with_reduce policies based on + BLOCK_SIZE> cuda/hip_exec_with_reduce policies based on the with_reduce boolean. cuda/hip_exec_grid forall, Execute loop iterations mapped to global threads via @@ -285,11 +285,11 @@ policies have the prefix ``hip_``. of the kernel for performance reasons. cuda/hip_exec_occ_fraction> of the maximum occupancy + RAJA::Fraction> of the maximum occupancy of the kernel. cuda/hip_exec_occ_custom policy but the grid size + Concretizer> policy but the grid size is determined by the concretizer. cuda/hip_launch_t launch Launches a device kernel, @@ -458,8 +458,8 @@ Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupan occupancy of the device. Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an - Fraction, occupancy based on the max occupancy - BLOCKS_PER_SM_OFFSET> Using the following formula: +Fraction, occupancy based on the max occupancy +BLOCKS_PER_SM_OFFSET> Using the following formula: (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * sm_per_device From b4d4dce03b0c7072390f132c23f43fb3fe3b7c2a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 13:39:17 -0700 Subject: [PATCH 093/108] convert gpu exec policy table into grid table --- docs/sphinx/user_guide/feature/policies.rst | 433 +++++++++++--------- 1 file changed, 238 insertions(+), 195 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index d418163c11..50301084ef 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -236,201 +236,244 @@ RAJA policies for GPU execution using CUDA or HIP are essentially identical. The only difference is that CUDA policies have the prefix ``cuda_`` and HIP policies have the prefix ``hip_``. - ========================================= ============= ======================================= - CUDA/HIP Execution Policies Works with Brief description - ========================================= ============= ======================================= - cuda/hip_exec forall, Execute loop iterations - scan, directly mapped to global threads - sort in a GPU kernel launched - with given thread-block - size and unbounded grid size. - Note that the thread-block - size must be provided, - there is no default. - cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is - recommended for use with reducers. - In general using the occupancy - calculator policies are better for - reducers but exactly how much occupancy - to use differs by platform so this policy - provides a simple way to get what works - best for a platform without having to - know the details. - cuda/hip_exec_base cuda/hip_exec_with_reduce policies based on - the with_reduce boolean. - cuda/hip_exec_grid forall, Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size. - Note that the thread-block - size and grid size must be - provided, there is no default. - cuda/hip_exec_occ_max forall Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size bounded - by the maximum occupancy of - the kernel. 
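The occupancy-based ``forall`` policies being re-documented in this hunk differ only in how the grid size is chosen, so switching between them is a one-line change in user code. A hedged sketch; the ``RAJA::Fraction`` spelling (integer type, numerator, denominator) is an assumption inferred from the table entry, not a confirmed signature::

    // Grid size chosen by the occupancy calculator, possibly below the maximum.
    using occ_calc_pol = RAJA::cuda_exec_occ_calc<256>;

    // Grid size capped at half of the maximum occupancy (assumed Fraction signature).
    using occ_half_pol =
        RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction<std::size_t, 1, 2>>;

    RAJA::forall<occ_calc_pol>(RAJA::RangeSegment(0, N),
        [=] RAJA_DEVICE (int i) {
          // ... loop body, typically containing reducers ...
        });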
- cuda/hip_exec_occ_calc forall Similar to the occ_max - policy but may use less - than the maximum occupancy - determined by the occupancy calculator - of the kernel for performance - reasons. - cuda/hip_exec_occ_fraction> of the maximum occupancy - of the kernel. - cuda/hip_exec_occ_custom policy but the grid size - is determined by the - concretizer. - cuda/hip_launch_t launch Launches a device kernel, - any code expressed within - the lambda is executed - on the device. - cuda/hip_thread_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU threads - in x-dimension, one - iterate per thread - (see note below about - limitations) - cuda/hip_thread_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_thread_x_loop kernel (For) Similar to - launch (loop) thread-x-direct - policy, but use a - block-stride loop which - doesn't limit number of - loop iterates - cuda/hip_thread_y_loop kernel (For) Same as above, but for - launch (loop) threads in y-dimension - cuda/hip_thread_z_loop kernel (For) Same as above, but for - launch (loop) threads in z-dimension - cuda/hip_thread_syncable_loop kernel (For) Similar to thread-loop - launch (loop) policy, but safe to use - with Cuda/HipSyncThreads - cuda/hip_thread_size_x_direct kernel (For) Same as thread_x_direct - launch (loop) policy above but with - a compile time number of - threads - cuda/hip_thread_size_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_size_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_flatten_threads_{xyz}_direct launch (loop) Reshapes threads in a - multi-dimensional thread - team into one-dimension, - accepts any permutation - of dimensions - cuda/hip_block_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU thread - blocks in x-dimension, - one iterate per block - cuda/hip_block_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dimension - cuda/hip_block_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dimension - cuda/hip_block_x_loop kernel (For) Similar to - launch (loop) block-x-direct policy, - but use a grid-stride - loop. - cuda/hip_block_y_loop kernel (For) Same as above, but use - launch (loop) blocks in y-dimension - cuda/hip_block_z_loop kernel (For) Same as above, but use - launch (loop) blocks in z-dimension - cuda/hip_block_size_x_direct kernel (For) Same as block_x_direct - launch (loop) policy above but with - a compile time number of - blocks - cuda/hip_block_size_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dim - cuda/hip_block_size_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dim - cuda/hip_global_x_direct kernel (For) Creates a unique thread - launch (loop) id for each thread on - x-dimension of the grid. - Same as computing - threadIdx.x + - threadDim.x * blockIdx.x. - cuda/hip_global_y_direct kernel (For) Same as above, but uses - launch (loop) globals in y-dimension. - cuda/hip_global_z_direct kernel (For) Same as above, but uses - launch (loop) globals in z-dimension. - cuda/hip_global_x_loop kernel (For) Similar to - launch (loop) global-x-direct policy, - but use a grid-stride - loop. 
- cuda/hip_global_y_loop kernel (For) Same as above, but use - launch (loop) globals in y-dimension - cuda/hip_global_z_loop kernel (For) Same as above, but use - launch (loop) globals in z-dimension - cuda/hip_global_size_x_direct kernel (For) Same as global_x_direct - launch (loop) policy above but with - a compile time block - size - cuda/hip_global_size_y_direct kernel (For) Same as above, but map - launch (loop) to globals in y-dim - cuda/hip_global_size_z_direct kernel (For) Same as above, but map - launch (loop) to globals in z-dim - cuda/hip_warp_direct kernel (For) Map work to threads - in a warp directly. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_loop kernel (For) Policy to map work to - threads in a warp using - a warp-stride loop. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_direct> kernel (For) Policy to map work - directly to threads in a - warp using a bit mask. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can - be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_loop> kernel (For) Policy to map work to - threads in a warp using - a bit mask and a - warp-stride loop. Cannot - be used in conjunction - with cuda/hip_thread_x_* - policies. Multiple warps - can be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_block_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread block. - cuda/hip_warp_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread warp. - ========================================= ============= ======================================= ++-----------------------------------------+---------------+---------------------------------------+ +| CUDA/HIP Execution Policies | Works with | Brief description | ++=========================================+===============+=======================================+ +| cuda/hip_exec | forall, | Execute loop iterations | +| | scan, | directly mapped to global threads | +| | sort | in a GPU kernel launched | +| | | with given thread-block | +| | | size and unbounded grid size. | +| | | Note that the thread-block | +| | | size must be provided, | +| | | there is no default. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy that is | +| | | recommended for use with reducers. | +| | | In general using the occupancy | +| | | calculator policies are better for | +| | | reducers but exactly how much | +| | | occupancy to use differs by platform | +| | | so this policy provides a simple way | +| | | to get what works best for a platform | +| | | without having to know the details. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_base | | cuda/hip_exec_with_reduce policies | +| | | based on the with_reduce boolean. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_grid | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size. 
| +| | | Note that the thread-block | +| | | size and grid size must be | +| | | provided, there is no default. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_max | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size bounded | +| | | by the maximum occupancy of | +| | | the kernel. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | +| | | policy but may use less | +| | | than the maximum occupancy | +| | | determined by the occupancy | +| | | calculator of the kernel for | +| | | performance reasons. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_fraction> | | of the maximum occupancy | +| | | of the kernel. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_custom | | policy but the grid size | +| | | is determined by the | +| | | concretizer. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_launch_t | launch | Launches a device kernel, | +| | | any code expressed within | +| | | the lambda is executed | +| | | on the device. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU threads | +| | | in x-dimension, one | +| | | iterate per thread | +| | | (see note below about | +| | | limitations) | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_x_loop | kernel (For) | Similar to | +| | launch (loop) | thread-x-direct | +| | | policy, but use a | +| | | block-stride loop which | +| | | doesn't limit number of | +| | | loop iterates | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | +| | launch (loop) | policy, but safe to use | +| | | with Cuda/HipSyncThreads | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_size_x_direct| kernel (For) | Same as thread_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | 
threads | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_size_y_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_size_z_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | +| | | multi-dimensional thread | +| | | team into one-dimension, | +| | | accepts any permutation | +| | | of dimensions | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU thread | +| | | blocks in x-dimension, | +| | | one iterate per block | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_x_loop | kernel (For) | Similar to | +| | launch (loop) | block-x-direct policy, | +| | | but use a grid-stride | +| | | loop. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | blocks | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | +| | launch (loop) | id for each thread on | +| | | x-dimension of the grid. | +| | | Same as computing | +| | | threadIdx.x + | +| | | threadDim.x * blockIdx.x. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in y-dimension. 
| ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in z-dimension. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_x_loop | kernel (For) | Similar to | +| | launch (loop) | global-x-direct policy, | +| | | but use a grid-stride | +| | | loop. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_size_x_direct| kernel (For) | Same as global_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_size_y_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_size_z_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_warp_direct | kernel (For) | Map work to threads | +| | | in a warp directly. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_warp_loop | kernel (For) | Policy to map work to | +| | | threads in a warp using | +| | | a warp-stride loop. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_warp_masked_direct>| kernel | Policy to map work | +| | (For) | directly to threads in a | +| | | warp using a bit mask. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can | +| | | be created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_warp_masked_loop> | kernel | Policy to map work to | +| | (For) | threads in a warp using | +| | | a bit mask and a | +| | | warp-stride loop. Cannot | +| | | be used in conjunction | +| | | with cuda/hip_thread_x_* | +| | | policies. Multiple warps | +| | | can be created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_block_reduce | kernel | Perform a reduction | +| | (Reduce) | across a single GPU | +| | | thread block. 
| ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_warp_reduce | kernel | Perform a reduction | +| | (Reduce) | across a single GPU | +| | | thread warp. | ++-----------------------------------------+---------------+--------------------------------------+ When a CUDA or HIP policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. The From deee60881fa0dd2d1b42b3221ad289ec78be91ed Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 14:33:50 -0700 Subject: [PATCH 094/108] Improve table formatting --- docs/sphinx/user_guide/feature/policies.rst | 465 ++++++++++---------- 1 file changed, 227 insertions(+), 238 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 50301084ef..5fd9a3f92e 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -236,244 +236,233 @@ RAJA policies for GPU execution using CUDA or HIP are essentially identical. The only difference is that CUDA policies have the prefix ``cuda_`` and HIP policies have the prefix ``hip_``. -+-----------------------------------------+---------------+---------------------------------------+ -| CUDA/HIP Execution Policies | Works with | Brief description | -+=========================================+===============+=======================================+ -| cuda/hip_exec | forall, | Execute loop iterations | -| | scan, | directly mapped to global threads | -| | sort | in a GPU kernel launched | -| | | with given thread-block | -| | | size and unbounded grid size. | -| | | Note that the thread-block | -| | | size must be provided, | -| | | there is no default. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy that is | -| | | recommended for use with reducers. | -| | | In general using the occupancy | -| | | calculator policies are better for | -| | | reducers but exactly how much | -| | | occupancy to use differs by platform | -| | | so this policy provides a simple way | -| | | to get what works best for a platform | -| | | without having to know the details. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_base | | cuda/hip_exec_with_reduce policies | -| | | based on the with_reduce boolean. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_grid | | mapped to global threads via | -| | | grid striding with multiple | -| | | iterations per global thread | -| | | in a GPU kernel launched | -| | | with given thread-block | -| | | size and grid size. | -| | | Note that the thread-block | -| | | size and grid size must be | -| | | provided, there is no default. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_max | forall | Execute loop iterations | -| | | mapped to global threads via | -| | | grid striding with multiple | -| | | iterations per global thread | -| | | in a GPU kernel launched | -| | | with given thread-block | -| | | size and grid size bounded | -| | | by the maximum occupancy of | -| | | the kernel. 
| -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | -| | | policy but may use less | -| | | than the maximum occupancy | -| | | determined by the occupancy | -| | | calculator of the kernel for | -| | | performance reasons. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_fraction> | | of the maximum occupancy | -| | | of the kernel. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_custom | | policy but the grid size | -| | | is determined by the | -| | | concretizer. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_launch_t | launch | Launches a device kernel, | -| | | any code expressed within | -| | | the lambda is executed | -| | | on the device. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates | -| | launch (loop) | directly to GPU threads | -| | | in x-dimension, one | -| | | iterate per thread | -| | | (see note below about | -| | | limitations) | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to threads in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to threads in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_x_loop | kernel (For) | Similar to | -| | launch (loop) | thread-x-direct | -| | | policy, but use a | -| | | block-stride loop which | -| | | doesn't limit number of | -| | | loop iterates | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | -| | launch (loop) | threads in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | -| | launch (loop) | threads in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | -| | launch (loop) | policy, but safe to use | -| | | with Cuda/HipSyncThreads | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_size_x_direct| kernel (For) | Same as thread_x_direct | -| | launch (loop) | policy above but with | -| | | a compile time number of | -| | | threads | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_size_y_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to threads in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_size_z_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to threads in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| 
cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | -| | | multi-dimensional thread | -| | | team into one-dimension, | -| | | accepts any permutation | -| | | of dimensions | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | -| | launch (loop) | directly to GPU thread | -| | | blocks in x-dimension, | -| | | one iterate per block | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_x_loop | kernel (For) | Similar to | -| | launch (loop) | block-x-direct policy, | -| | | but use a grid-stride | -| | | loop. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | -| | launch (loop) | blocks in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | -| | launch (loop) | blocks in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | -| | launch (loop) | policy above but with | -| | | a compile time number of | -| | | blocks | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | -| | launch (loop) | id for each thread on | -| | | x-dimension of the grid. | -| | | Same as computing | -| | | threadIdx.x + | -| | | threadDim.x * blockIdx.x. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | -| | launch (loop) | globals in y-dimension. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | -| | launch (loop) | globals in z-dimension. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_x_loop | kernel (For) | Similar to | -| | launch (loop) | global-x-direct policy, | -| | | but use a grid-stride | -| | | loop. 
| -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | -| | launch (loop) | globals in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | -| | launch (loop) | globals in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_size_x_direct| kernel (For) | Same as global_x_direct | -| | launch (loop) | policy above but with | -| | | a compile time block | -| | | size | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_size_y_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to globals in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_size_z_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to globals in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_warp_direct | kernel (For) | Map work to threads | -| | | in a warp directly. | -| | | Cannot be used in | -| | | conjunction with | -| | | cuda/hip_thread_x_* | -| | | policies. | -| | | Multiple warps can be | -| | | created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_warp_loop | kernel (For) | Policy to map work to | -| | | threads in a warp using | -| | | a warp-stride loop. | -| | | Cannot be used in | -| | | conjunction with | -| | | cuda/hip_thread_x_* | -| | | policies. | -| | | Multiple warps can be | -| | | created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_warp_masked_direct>| kernel | Policy to map work | -| | (For) | directly to threads in a | -| | | warp using a bit mask. | -| | | Cannot be used in | -| | | conjunction with | -| | | cuda/hip_thread_x_* | -| | | policies. | -| | | Multiple warps can | -| | | be created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_warp_masked_loop> | kernel | Policy to map work to | -| | (For) | threads in a warp using | -| | | a bit mask and a | -| | | warp-stride loop. Cannot | -| | | be used in conjunction | -| | | with cuda/hip_thread_x_* | -| | | policies. Multiple warps | -| | | can be created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_block_reduce | kernel | Perform a reduction | -| | (Reduce) | across a single GPU | -| | | thread block. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_warp_reduce | kernel | Perform a reduction | -| | (Reduce) | across a single GPU | -| | | thread warp. 
| -+-----------------------------------------+---------------+--------------------------------------+ ++----------------------------------------------------+---------------+---------------------------------+ +| CUDA/HIP Execution Policies | Works with | Brief description | ++====================================================+===============+=================================+ +| cuda/hip_exec | forall, | Execute loop iterations | +| | scan, | directly mapped to global | +| | sort | threads in a GPU kernel | +| | | launched with given threadblock | +| | | size and unbounded grid size. | +| | | Note that the threadblock | +| | | size must be provided. | +| | | There is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy | +| | | recommended for use with | +| | | kernels containing reductions. | +| | | In general, using the occupancy | +| | | calculator policies improves | +| | | performance of kernels with | +| | | reductions. Exactly how much | +| | | occupancy to use differs by | +| | | platform. This policy provides | +| | | a simple way to get what works | +| | | well for a platform without | +| | | having to know the details. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_base | forall | Choose between cuda/hip_exec | +| | | and cuda/hip_exec_with_reduce | +| | | policies based on the boolean | +| | | template parameter 'with_reduce'| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_grid | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size. | +| | | Note that the thread-block | +| | | size and grid size must be | +| | | provided, there is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_max | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size bounded | +| | | by the maximum occupancy of | +| | | the kernel. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | +| | | policy but may use less | +| | | than the maximum occupancy | +| | | determined by the occupancy | +| | | calculator of the kernel for | +| | | performance reasons. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_fraction> | | | +| | | | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | +| | | policy but the grid size is | +| | | is determined by concretizer. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_launch_t | launch | Launches a device kernel, any | +| | | code inside the lambda | +| | | expression is executed | +| | | on the device. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates directly to | +| | launch (loop) | GPU threads in x-dimension, one | +| | | iterate per thread. See note | +| | | below about limitations. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_loop | kernel (For) | Similar to thread-x-direct | +| | launch (loop) | policy, but use a block-stride | +| | | loop which doesn't limit total | +| | | number of loop iterates. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | +| | launch (loop) | policy, but safe to use | +| | | with Cuda/HipSyncThreads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_x_direct | kernel (For) | Same as thread_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | threads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | +| | | multi-dimensional thread | +| | | team into one-dimension, | +| | | accepts any permutation | +| | | of dimensions | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU thread | +| | | blocks in x-dimension, | +| | | one iterate per block | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_loop | kernel (For) | Similar to | +| | launch (loop) | block-x-direct policy, | +| | | but use a grid-stride | +| | | loop. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | blocks | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | +| | launch (loop) | id for each thread on | +| | | x-dimension of the grid. | +| | | Same as computing | +| | | threadIdx.x + | +| | | threadDim.x * blockIdx.x. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_loop | kernel (For) | Similar to | +| | launch (loop) | global-x-direct policy, | +| | | but use a grid-stride | +| | | loop. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_x_direct | kernel (For) | Same as global_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_direct | kernel (For) | Map work to threads | +| | | in a warp directly. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_loop | kernel (For) | Map work to threads in a warp | +| | | using a warp-stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_direct> | kernel | Mmap work directly to threads | +| | (For) | in a warp using a bit mask. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_loop> | kernel | Map work to threads in a warp | +| | (For) | using a bit mask and a warp- | +| | | stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread block. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread warp. | +| | | thread warp. | ++----------------------------------------------------+---------------+---------------------------------+ When a CUDA or HIP policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. 
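As a rough sketch of how the policies in the table above are typically used with
``RAJA::forall`` (the block size of 256, the device pointers ``d_a`` and ``d_b``,
and the length ``N`` are illustrative assumptions; memory allocation and transfer
are omitted)::

   #include "RAJA/RAJA.hpp"

   void daxpy_and_dot(double* d_a, const double* d_b, double alpha, int N)
   {
     // cuda_exec requires an explicit thread-block size; there is no default.
     using exec_pol   = RAJA::cuda_exec<256>;
     // cuda_exec_with_reduce is the policy recommended above for kernels
     // that contain reductions.
     using reduce_pol = RAJA::cuda_exec_with_reduce<256>;

     // Elementwise update: one loop iterate per global thread.
     RAJA::forall<exec_pol>(RAJA::RangeSegment(0, N),
       [=] RAJA_DEVICE (int i) {
         d_a[i] += alpha * d_b[i];
       });

     // Dot product: the reduction value is finalized when get() is called.
     RAJA::ReduceSum<RAJA::cuda_reduce, double> dot(0.0);
     RAJA::forall<reduce_pol>(RAJA::RangeSegment(0, N),
       [=] RAJA_DEVICE (int i) {
         dot += d_a[i] * d_b[i];
       });

     double result = dot.get();
     (void) result;  // use result as needed
   }

The equivalent HIP policies use the ``hip_`` prefix, as noted above.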
The From b12ff8faebfb365f8262ec9c7720f6956958f764 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 13:50:48 -0700 Subject: [PATCH 095/108] Fix concretizer table --- docs/sphinx/user_guide/feature/policies.rst | 52 ++++++++++----------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 5fd9a3f92e..4c69ef44ca 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -469,33 +469,31 @@ unspecified a concretizer object is used to decide those parameters. The following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` policies: -=================================================== ========================================= -Execution Policy Brief description -=================================================== ========================================= - -Cuda/HipDefaultConcretizer The default concretizer, expected to - provide good performance in general. - Note that it may not use max occupancy. - -Cuda/HipRecForReduceConcretizer Expected to provide good performance - in loops with reducers. - Note that it may not use max occupancy. - -Cuda/HipMaxOccupancyConcretizer Uses max occupancy. - -Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupancy of the - device in terms of threads. - Note that it may use the max occupancy - of the kernel if that is below the max - occupancy of the device. - -Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an -Fraction, occupancy based on the max occupancy -BLOCKS_PER_SM_OFFSET> Using the following formula: - (Fraction * kernel_max_blocks_per_sm + - BLOCKS_PER_SM_OFFSET) * sm_per_device - -=================================================== ========================================= ++----------------------------------------------------+-----------------------------------------+ +| Execution Policy | Brief description | ++====================================================+=========================================+ +| Cuda/HipDefaultConcretizer | The default concretizer, expected to | +| | provide good performance in general. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipRecForReduceConcretizer | Expected to provide good performance | +| | in loops with reducers. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipMaxOccupancyConcretizer | Uses max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer | Avoids using the max occupancy of the | +| | device in terms of threads. | +| | Note that it may use the max occupancy | +| | of the kernel if that is below the max | +| | occupancy of the device. 
| ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipFractionOffsetOccupancyConcretizer< | Uses a fraction and offset to choose an | +| Fraction, | occupancy based on the max occupancy | +| BLOCKS_PER_SM_OFFSET> | Using the following formula: | +| | (Fraction * kernel_max_blocks_per_sm + | +| | BLOCKS_PER_SM_OFFSET) * sm_per_device | ++----------------------------------------------------+-----------------------------------------+ Several notable constraints apply to RAJA CUDA/HIP *direct* policies. From f22d6a7e4573aae604c3632ce516ccce862ac619 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 14:53:18 -0700 Subject: [PATCH 096/108] Attempt to claarify some reduction policy type names. --- docs/sphinx/user_guide/feature/policies.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 4c69ef44ca..7660ab2a90 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -797,20 +797,20 @@ cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_r policy cuda/hip_reduce_atomic policies based on the with_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the - memory used for atomics on the host. + policy memory used for atomics on the host. This works on recent architectures and incurs lower overheads. cuda/hip_reduce\*device_init\* any CUDA/HIP Same as above, but initializes the - memory used for atomics on the device. + policy memory used for atomics on the device. This works on all architectures but incurs higher overheads. -cuda/hip_reduce\*device_fence any CUDA/HIP Same as above, and reduction uses normal +cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal policy memory accesses that are not visible across the whole device and device scope fences to ensure visibility and ordering. This works on all architectures but incurs higher overheads on some architectures. -cuda/hip_reduce\*block_fence any CUDA/HIP Same as above, and reduction uses special +cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special policy memory accesses to a level of cache shared visible to the whole device and block scope fences to ensure ordering. This improves From ed9bb20700c8c37af296c88f2e6e45b723472901 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 14:58:55 -0700 Subject: [PATCH 097/108] Fix SYCL policy table --- docs/sphinx/user_guide/feature/policies.rst | 177 ++++++++++---------- 1 file changed, 88 insertions(+), 89 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 7660ab2a90..e68e990d50 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -577,95 +577,94 @@ GPU Policies for SYCL 2 always exists and should be used as one would use the x dimension for CUDA and HIP. - ======================================== ============= ============================== - SYCL Execution Policies Works with Brief description - ======================================== ============= ============================== - sycl_exec forall, Execute loop iterations - in a GPU kernel launched - with given work group - size. - sycl_launch_t launch Launches a sycl kernel, - any code express within - the lambda is executed - on the device. 
- sycl_global_0 kernel (For) Map loop iterates - directly to GPU global - ids in first - dimension, one iterate - per work item. Group - execution into work - groups of given size. - sycl_global_1 kernel (For) Same as above, but map - to global ids in second - dim - sycl_global_2 kernel (For) Same as above, but map - to global ids in third - dim - sycl_global_item_0 launch (loop) Creates a unique thread - id for each thread for - dimension 0 of the grid. - Same as computing - itm.get_group(0) * - itm.get_local_range(0) + - itm.get_local_id(0). - sycl_global_item_1 launch (loop) Same as above, but uses - threads in dimension 1 - Same as computing - itm.get_group(1) + - itm.get_local_range(1) * - itm.get_local_id(1). - sycl_global_item_2 launch (loop) Same as above, but uses - threads in dimension 2 - Same as computing - itm.get_group(2) + - itm.get_local_range(2) * - itm.get_local_id(2). - sycl_local_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU work - items in first - dimension, one iterate - per work item (see note - below about limitations) - sycl_local_1_direct kernel (For) Same as above, but map - launch (loop) to work items in second - dim - sycl_local_2_direct kernel (For) Same as above, but map - launch (loop) to work items in third - dim - sycl_local_0_loop kernel (For) Similar to - launch (loop) local-1-direct policy, - but use a work - group-stride loop which - doesn't limit number of - loop iterates - sycl_local_1_loop kernel (For) Same as above, but for - launch (loop) work items in second - dimension - sycl_local_2_loop kernel (For) Same as above, but for - launch (loop) work items in third - dimension - sycl_group_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU group - ids in first dimension, - one iterate per group - sycl_group_1_direct kernel (For) Same as above, but map - launch (loop) to groups in second - dimension - sycl_group_2_direct kernel (For) Same as above, but map - launch (loop) to groups in third - dimension - sycl_group_0_loop kernel (For) Similar to - launch (loop) group-1-direct policy, - but use a group-stride - loop. - sycl_group_1_loop kernel (For) Same as above, but use - launch (loop) groups in second - dimension - sycl_group_2_loop kernel (For) Same as above, but use - launch (loop) groups in third - dimension - - ======================================== ============= ============================== +======================================== ============= ============================== +SYCL Execution Policies Works with Brief description +======================================== ============= ============================== +sycl_exec forall, Execute loop iterations + in a GPU kernel launched + with given work group + size. +sycl_launch_t launch Launches a sycl kernel, + any code express within + the lambda is executed + on the device. +sycl_global_0 kernel (For) Map loop iterates + directly to GPU global + ids in first + dimension, one iterate + per work item. Group + execution into work + groups of given size. +sycl_global_1 kernel (For) Same as above, but map + to global ids in second + dim +sycl_global_2 kernel (For) Same as above, but map + to global ids in third + dim +sycl_global_item_0 launch (loop) Creates a unique thread + id for each thread for + dimension 0 of the grid. + Same as computing + itm.get_group(0) * + itm.get_local_range(0) + + itm.get_local_id(0). 
+sycl_global_item_1 launch (loop) Same as above, but uses + threads in dimension 1 + Same as computing + itm.get_group(1) + + itm.get_local_range(1) * + itm.get_local_id(1). +sycl_global_item_2 launch (loop) Same as above, but uses + threads in dimension 2 + Same as computing + itm.get_group(2) + + itm.get_local_range(2) * + itm.get_local_id(2). +sycl_local_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU work + items in first + dimension, one iterate + per work item (see note + below about limitations) +sycl_local_1_direct kernel (For) Same as above, but map + launch (loop) to work items in second + dim +sycl_local_2_direct kernel (For) Same as above, but map + launch (loop) to work items in third + dim +sycl_local_0_loop kernel (For) Similar to + launch (loop) local-1-direct policy, + but use a work + group-stride loop which + doesn't limit number of + loop iterates +sycl_local_1_loop kernel (For) Same as above, but for + launch (loop) work items in second + dimension +sycl_local_2_loop kernel (For) Same as above, but for + launch (loop) work items in third + dimension +sycl_group_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU group + ids in first dimension, + one iterate per group +sycl_group_1_direct kernel (For) Same as above, but map + launch (loop) to groups in second + dimension +sycl_group_2_direct kernel (For) Same as above, but map + launch (loop) to groups in third + dimension +sycl_group_0_loop kernel (For) Similar to + launch (loop) group-1-direct policy, + but use a group-stride + loop. +sycl_group_1_loop kernel (For) Same as above, but use + launch (loop) groups in second + dimension +sycl_group_2_loop kernel (For) Same as above, but use + launch (loop) groups in third + dimension +======================================== ============= ============================== OpenMP Target Offload Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 960c251c007bfae35f0fa76930d567411253b2d1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 15:01:49 -0700 Subject: [PATCH 098/108] expand reduce policy table --- docs/sphinx/user_guide/feature/policies.rst | 98 +++++++++++---------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index e68e990d50..181d1a5754 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -242,7 +242,7 @@ policies have the prefix ``hip_``. | cuda/hip_exec | forall, | Execute loop iterations | | | scan, | directly mapped to global | | | sort | threads in a GPU kernel | -| | | launched with given threadblock | +| | | launched with given threadblock | | | | size and unbounded grid size. | | | | Note that the threadblock | | | | size must be provided. | @@ -297,7 +297,7 @@ policies have the prefix ``hip_``. 
| cuda/hip_exec_occ_fraction> | | | +| denominator>> | | | | | | | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | @@ -775,49 +775,57 @@ It is important to note the following constraints about RAJA reduction usage: The following table summarizes RAJA reduction policy types: -======================================== ============= ========================================== -Reduction Policy Loop Policies Brief description - to Use With -======================================== ============= ========================================== -seq_reduce seq_exec, Non-parallel (sequential) reduction. -omp_reduce any OpenMP OpenMP parallel reduction. - policy -omp_reduce_ordered any OpenMP OpenMP parallel reduction with result - policy guaranteed to be reproducible. -omp_target_reduce any OpenMP OpenMP parallel target offload reduction. - target policy -cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel - policy (device synchronization will occur when - reduction value is finalized). -cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use - policy atomic operations leading to run to run - variability in the results. -cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and - policy cuda/hip_reduce_atomic policies based on - the with_atomic boolean. -cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the - policy memory used for atomics on the host. - This works on recent architectures and - incurs lower overheads. -cuda/hip_reduce\*device_init\* any CUDA/HIP Same as above, but initializes the - policy memory used for atomics on the device. - This works on all architectures but - incurs higher overheads. -cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal - policy memory accesses that are not visible across - the whole device and device scope fences - to ensure visibility and ordering. - This works on all architectures but - incurs higher overheads on some architectures. -cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special - policy memory accesses to a level of cache shared - visible to the whole device and block scope - fences to ensure ordering. This improves - performance on some architectures. -sycl_reduce any SYCL Reduction in a SYCL kernel (device - policy synchronization will occur when the - reduction value is finalized). -======================================== ============= ========================================== +================================================= ============= ========================================== +Reduction Policy Loop Policies Brief description + to Use With +================================================= ============= ========================================== +seq_reduce seq_exec, Non-parallel (sequential) reduction. +omp_reduce any OpenMP OpenMP parallel reduction. + policy +omp_reduce_ordered any OpenMP OpenMP parallel reduction with result + policy guaranteed to be reproducible. +omp_target_reduce any OpenMP OpenMP parallel target offload reduction. + target policy +cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel + policy (device synchronization will occur when + reduction value is finalized). +cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use + policy atomic operations leading to run to run + variability in the results. 
+cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and + policy cuda/hip_reduce_atomic policies based on + the with_atomic boolean. +cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses that are not visible across + the whole device and device scope fences + to ensure visibility and ordering. + This works on all architectures but + incurs higher overheads on some architectures. +cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to a level of cache + visible to the whole device and block scope + fences to ensure ordering. This improves + performance on some architectures. +cuda/hip_reduce_atomic_host_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_host_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_device_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +cuda/hip_reduce_atomic_device_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +sycl_reduce any SYCL Reduction in a SYCL kernel (device + policy synchronization will occur when the + reduction value is finalized). +================================================= ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not guaranteed to generate correct results. So they should not be used From 960c30378ddfd39257d434315a84c87195278978 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 15:08:18 -0700 Subject: [PATCH 099/108] try something with multi-line tables --- docs/sphinx/user_guide/feature/policies.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 181d1a5754..a7b3c1332a 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -489,8 +489,8 @@ policies: | | occupancy of the device. 
| +----------------------------------------------------+-----------------------------------------+ | Cuda/HipFractionOffsetOccupancyConcretizer< | Uses a fraction and offset to choose an | -| Fraction, | occupancy based on the max occupancy | -| BLOCKS_PER_SM_OFFSET> | Using the following formula: | +| Fraction, | occupancy based on the max occupancy | +| BLOCKS_PER_SM_OFFSET> | Using the following formula: | | | (Fraction * kernel_max_blocks_per_sm + | | | BLOCKS_PER_SM_OFFSET) * sm_per_device | +----------------------------------------------------+-----------------------------------------+ From 7f85f630393597b005aee91ad346cdbcb5ce683f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 15:19:41 -0700 Subject: [PATCH 100/108] fix more multi line table entries --- docs/sphinx/user_guide/feature/policies.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index a7b3c1332a..afd2d7638a 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -295,9 +295,9 @@ policies have the prefix ``hip_``. | | | performance reasons. | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_exec_occ_fraction> | | | +| Fraction> | | but use a fraction of the | +| | | maximum occupancy of the kernel.| +| | | | | | | | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | From bc5e371a14610c70a4e329c04570076d02a325ef Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 15:21:01 -0700 Subject: [PATCH 101/108] Fix SYCL exec policy table formatting and clarify note about SYCL reverse ordering from CUDA/HIP --- docs/sphinx/user_guide/feature/policies.rst | 43 +++++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index e68e990d50..dd0b14ccd9 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -561,22 +561,41 @@ write more explicit policies. GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. note:: SYCL uses C++-style ordering in which the right - most index corresponds to having unit stride. - In a three-dimensional compute grid this means - that dimension 2 has the unit stride while - dimension 0 has the longest stride. This is - important to note as the ordering is reverse - compared to the CUDA and HIP programming models. - CUDA and HIP employ a x/y/z ordering in which - dimension x has the unit stride. - - When using RAJA::launch, thread and team configuration +.. note:: SYCL uses C++-style ordering for its work group and global thread + dimension/indexing types. This is due, in part, to SYCL's closer + alignment with C++ multi-dimensional indexing, which is "row-major". + This is the reverse of the thread indexing used in CUDA or HIP, + which is "column-major". For example, suppose we have a thread-block + or work-group where we specify the shape as (nx, ny, nz). Consider + an element in the thread-block or work-group with id (x, y, z). + In CUDA or HIP, the element index is x + y * nx + z * nx * ny. In + SYCL, the element index is z + y * nz + x * nz * ny. 
+ + In terms of the CUDA or HIP built-in variables to support threads, + we have:: + + Thread ID: threadIdx.x/y/z + Block ID: blockIdx.x/y/z + Block dimension: blockDim.x/y/z + Grid dimension: gridDim.x/y/z + + The analogues in SYCL are:: + + Thread ID: sycl::nd_item.get_local_id(2/1/0) + Work-group ID: sycl::nd_item.get_group(2/1/0) + Work-group dimensions: sycl::nd_item.get_local_range().get(2/1/0) + ND-range dimensions: sycl::nd_item.get_group_range(2/1/0) + + When using ``RAJA::launch``, thread and block configuration follows CUDA and HIP programming models and is always - configured in three-dimensions. This means that dimension + configured in three-dimensions. This means that SYCL dimension 2 always exists and should be used as one would use the x dimension for CUDA and HIP. + Similarly, ``RAJA::kernel`` uses a three-dimensional work-group + configuration. SYCL imension 2 always exists and should be used as + one would use the x dimension in CUDA and HIP. + ======================================== ============= ============================== SYCL Execution Policies Works with Brief description ======================================== ============= ============================== From 89004eb0cdb58392aef2a5f4f32fb8f9d7813e12 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 11:30:46 -0700 Subject: [PATCH 102/108] Bumping poodle allocation time to prevent intel 19 timeout --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index cee458cd60..6593a4e357 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -25,7 +25,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=60 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle From 70d4ccedd595872cff1fd6d9fdbeb7fb72f3251c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 13:38:12 -0700 Subject: [PATCH 103/108] Remove remnants of loop_exec and associated policies. These were deprecated a few releases back, but were maintained at the request of some users. --- include/RAJA/RAJA.hpp | 7 --- include/RAJA/policy/loop.hpp | 35 ------------ include/RAJA/policy/loop/policy.hpp | 87 ----------------------------- 3 files changed, 129 deletions(-) delete mode 100644 include/RAJA/policy/loop.hpp delete mode 100644 include/RAJA/policy/loop/policy.hpp diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 5478392ff1..c37ac997a4 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -59,13 +59,6 @@ // #include "RAJA/policy/sequential.hpp" -// -// NOTE: LOOP POLCIES WERE DEPRECATED IN 2023.03.0 RELEASE. -// THEY ARE RE-ADDED HERE AT REQUEST OF USERS. -// THEY WILL BE REMOVED AGAIN IN THE FUTURE. -// -#include "RAJA/policy/loop.hpp" - // // All platforms should support simd and vector execution. // diff --git a/include/RAJA/policy/loop.hpp b/include/RAJA/policy/loop.hpp deleted file mode 100644 index 2cd9525dcd..0000000000 --- a/include/RAJA/policy/loop.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/*! -****************************************************************************** -* -* \file -* -* \brief Header file containing RAJA headers for sequential execution. -* -* These methods work on all platforms. 
-* -****************************************************************************** -*/ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_loop_HPP -#define RAJA_loop_HPP - -#if !defined(RAJA_ENABLE_DESUL_ATOMICS) - #include "RAJA/policy/sequential/atomic.hpp" -#endif - -#include "RAJA/policy/sequential/forall.hpp" -#include "RAJA/policy/sequential/kernel.hpp" -#include "RAJA/policy/loop/policy.hpp" -#include "RAJA/policy/sequential/scan.hpp" -#include "RAJA/policy/sequential/sort.hpp" -#include "RAJA/policy/sequential/launch.hpp" -#include "RAJA/policy/sequential/WorkGroup.hpp" - -#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/loop/policy.hpp b/include/RAJA/policy/loop/policy.hpp deleted file mode 100644 index 1bf34250bb..0000000000 --- a/include/RAJA/policy/loop/policy.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief Header file containing RAJA sequential policy definitions. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef policy_loop_HPP -#define policy_loop_HPP - -#include "RAJA/policy/PolicyBase.hpp" - -#include "RAJA/policy/sequential/policy.hpp" - -namespace RAJA -{ -namespace policy -{ -namespace loop -{ - -// -////////////////////////////////////////////////////////////////////// -// -// Execution policies -// -////////////////////////////////////////////////////////////////////// -// - -/// -/// Segment execution policies -/// - -using loop_exec = seq_exec; - -/// -/// Index set segment iteration policies -/// -using loop_segit = seq_exec; - -/// -/// WorkGroup execution policies -/// -using loop_work = seq_work; - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Reduction execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_reduce = seq_reduce; - - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Atomic execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_atomic = seq_atomic; - -} // end namespace loop - -} // end namespace policy - -using policy::loop::loop_atomic; -using policy::loop::loop_exec; -using policy::loop::loop_reduce; -using policy::loop::loop_segit; -using policy::loop::loop_work; - -} // namespace RAJA - -#endif From 8b6ce63a54b5ce399f000d0a0fadb6c2f98b561e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 13:59:12 -0700 Subject: [PATCH 104/108] Update version number for release. 
--- CMakeLists.txt | 2 +- docs/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e4823564b..9e5ecec0b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ include(CMakeDependentOption) # Set version number set(RAJA_VERSION_MAJOR 2024) set(RAJA_VERSION_MINOR 02) -set(RAJA_VERSION_PATCHLEVEL 1) +set(RAJA_VERSION_PATCHLEVEL 2) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}") diff --git a/docs/conf.py b/docs/conf.py index 1570ed2888..3212170b30 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -88,7 +88,7 @@ # The short X.Y version. version = u'2024.02' # The full version, including alpha/beta/rc tags. -release = u'2024.02.1' +release = u'2024.02.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 4e0049ac5e8c47322bdb9477e2effe1c7ba56b6c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 14:00:19 -0700 Subject: [PATCH 105/108] Add release notes for v2024.02.2 release. --- RELEASE_NOTES.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 2e26861191..9efd9b277c 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -20,6 +20,39 @@ Notable changes include: * Bug fixes/improvements: +Version 2024.02.2 -- Release date 2024-05-08 +============================================ + +This release contains a bugfix and new execution policies that improve +performance for GPU kernels with reductions. + +Notable changes include: + + * New features / API changes: + * New CPU execution policies for CUDA and HIP added which provide + improved performance for GPU kernels with reductions. Please see the + RAJA User Guide for more information. Short summary: + * Option added to change max grid size in policies that use the + occupancy calculator. + * Policies added to run with max occupancy, a fraction of of the + max occupancy, and to run with a "concretizer" which allows a + user to determine how to run based on what the occupancy + calculator determines about a kernel. + * Additional options to tune kernels containing reductions, such as + * an option to initialize data on host for reductions that use + atomic operations + * an option to avoid device scope memory fences + * Change ordering of SYCL thread index ordering in RAJA::launch to + follow the SYCL "row-major" convention. Please see RAJA User Guide + for more information. + + * Build changes/improvements: + * NONE. + + * Bug fixes/improvements: + * Fixed issue in bump-style allocator used internally in RAJA::launch. + + Version 2024.02.1 -- Release date 2024-04-03 ============================================ From 89d87cd9ce0d5960a0dded5e9e37f558b5079585 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 15:30:23 -0700 Subject: [PATCH 106/108] Fix typo. --- RELEASE_NOTES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 9efd9b277c..c2df2a03ea 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -29,7 +29,7 @@ performance for GPU kernels with reductions. 
Notable changes include: * New features / API changes: - * New CPU execution policies for CUDA and HIP added which provide + * New GPU execution policies for CUDA and HIP added which provide improved performance for GPU kernels with reductions. Please see the RAJA User Guide for more information. Short summary: * Option added to change max grid size in policies that use the From b745c98a9383a959cce92f1568995152e00a1a97 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 May 2024 09:15:00 -0700 Subject: [PATCH 107/108] Update custom-jobs-and-variables.yml Bump poodle allocation time to avoid timeouts --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 6593a4e357..62d7908945 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -25,7 +25,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=120 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle From 919aafde3fcee684ef06768c00fafa1de788e607 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 May 2024 09:40:46 -0700 Subject: [PATCH 108/108] Update custom-jobs-and-variables.yml Bump poodle allocation time to prevent timeouts --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 6593a4e357..62d7908945 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -25,7 +25,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=120 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle
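The reduction-tuning policies described in the release notes above and in the
expanded reduction policy table earlier in this series are selected the same way
as any other reduction policy. A minimal sketch, assuming the policy spelling
shown in that table and illustrative values for the device pointer ``d_x``, the
length ``N``, and the block size of 256::

   #include "RAJA/RAJA.hpp"

   double device_sum(const double* d_x, int N)
   {
     // Atomic-based reduction that initializes its atomic storage on the host
     // and uses device-scope fences; the release notes describe this
     // combination as working well on recent GPU architectures.
     using reduce_pol = RAJA::cuda_reduce_atomic_host_init_device_fence;

     RAJA::ReduceSum<reduce_pol, double> sum(0.0);
     RAJA::forall<RAJA::cuda_exec_with_reduce<256>>(
       RAJA::RangeSegment(0, N),
       [=] RAJA_DEVICE (int i) { sum += d_x[i]; });

     return sum.get();
   }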