From 2d9dc6dc28ec91aa5e5930fff0159038db107196 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 12 Mar 2024 11:04:30 +0100 Subject: [PATCH 001/108] Update Uberenv + From RSC: Starting the process to move to spack environments --- scripts/radiuss-spack-configs | 2 +- scripts/uberenv | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index af75606a7f..11e8d91e90 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit af75606a7fc0492e35cdd3860337c4e873f43124 +Subproject commit 11e8d91e9093ac237fe00dab36470e24545e4e77 diff --git a/scripts/uberenv b/scripts/uberenv index 4941c237ee..0a39ce245d 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit 4941c237eec514d6d68872243efb9f4af8843f4d +Subproject commit 0a39ce245d7866374bf4724bec9da6ab4cf4dfcc From 053300031f2bb755b7ccd41b2e3712b1112b3994 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:23:25 +0100 Subject: [PATCH 002/108] From RSC: Merge config.yaml content into the environment files --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 11e8d91e90..1dc78a3d2f 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 11e8d91e9093ac237fe00dab36470e24545e4e77 +Subproject commit 1dc78a3d2f8ee919af0127d979e3c0ca4a63df38 From fe6c45ae446166eddb5fdeeae5a0cc86ac7bd130 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:56:27 +0100 Subject: [PATCH 003/108] From RSC: Fix config section names --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 1dc78a3d2f..eb964123fd 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 1dc78a3d2f8ee919af0127d979e3c0ca4a63df38 +Subproject commit eb964123fd422b1bc6cb848f482536009e15a393 From 32c09fa95d5e4fd64c6b6fb6bf1a52bee36d34b0 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 21 Mar 2024 11:01:48 +0100 Subject: [PATCH 004/108] From RSC: Update RADIUSS CI jobs with new compilers on lassen --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index af75606a7f..d8869052d1 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit af75606a7fc0492e35cdd3860337c4e873f43124 +Subproject commit d8869052d137137e1a8f2f36a93c10c91ed0e90c From dcc46c687f788900ed5882b850c106ab07f825e5 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 21 Mar 2024 22:14:59 +0100 Subject: [PATCH 005/108] From RSC: Apply changes to reduce the gap with spack packages --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index d8869052d1..55a4821edc 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit d8869052d137137e1a8f2f36a93c10c91ed0e90c +Subproject commit 55a4821edce8dbad4ef4f36b7e0c34e04984ab74 From d51e0888e5f970107fdc6eecb50a6f8e15ff944b Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 22 Mar 2024 17:07:53 +0100 Subject: [PATCH 006/108] Restore clang 12.0.1 jobs in lassen CI, fix spectrum-mpi paths, enforce compiler versions with @= --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index d8869052d1..9fe6f19c7c 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit d8869052d137137e1a8f2f36a93c10c91ed0e90c +Subproject commit 9fe6f19c7c0900acea9daabf7796798193551773 From fc5ec7b414d53365123959abd5f72a768466fb3c Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 26 Mar 2024 11:29:56 +0100 Subject: [PATCH 007/108] From RSC: changes for Caliper --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 316fffdf09..8938041fb2 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 316fffdf099576b8e90fee06834b7dd66898c49b +Subproject commit 8938041fb20dde5e55ae2014aa71333076d139c9 From a868503d8085772dd5f9212800e2980eca4dec6c Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:04:54 +0100 Subject: [PATCH 008/108] From RSC: Apply new changes from Spack PR + activate vectorization in CI Vectorization support now defaults to false in RAJA Spack package --- .gitlab/custom-jobs-and-variables.yml | 10 +++++----- scripts/radiuss-spack-configs | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index eb7011b78a..f652bb2caf 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -19,7 +19,7 @@ variables: # Note: We repeat the reservation, necessary when jobs are manually re-triggered. 
RUBY_JOB_ALLOC: "--reservation=ci --nodes=1" # Project specific variants for ruby - PROJECT_RUBY_VARIANTS: "~shared +openmp +tests" + PROJECT_RUBY_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for ruby PROJECT_RUBY_DEPS: "" @@ -29,7 +29,7 @@ variables: # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle - PROJECT_POODLE_VARIANTS: "~shared +openmp +tests" + PROJECT_POODLE_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for poodle PROJECT_POODLE_DEPS: "" @@ -39,7 +39,7 @@ variables: # Arguments for job level allocation CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_CORONA_VARIANTS: "~shared ~openmp +tests" + PROJECT_CORONA_VARIANTS: "~shared ~openmp +vectorization +tests" # Project specific deps for corona PROJECT_CORONA_DEPS: "^blt@develop " @@ -49,7 +49,7 @@ variables: # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp +tests" + PROJECT_TIOGA_VARIANTS: "~shared ~openmp +vectorization +tests" # Project specific deps for corona PROJECT_TIOGA_DEPS: "^blt@develop " @@ -58,7 +58,7 @@ variables: # Arguments for job level allocation LASSEN_JOB_ALLOC: "1 -W 30 -q pci" # Project specific variants for lassen - PROJECT_LASSEN_VARIANTS: "~shared +openmp +tests cuda_arch=70" + PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70" # Project specific deps for lassen PROJECT_LASSEN_DEPS: "^blt@develop " diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 8938041fb2..3d7465cecf 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 8938041fb20dde5e55ae2014aa71333076d139c9 +Subproject commit 3d7465cecf1285064df8a19668ccc66e24b9b388 From ab756d507ac0ea3c7f5c342d30ed908a06100dbc Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:55:36 +0100 Subject: [PATCH 009/108] From RSC: Fix variable name --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 3d7465cecf..6b706f2d20 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 3d7465cecf1285064df8a19668ccc66e24b9b388 +Subproject commit 6b706f2d20d608ea2a9c5e4bf5d6412345b4bd4a From 34d63081d0bf291f77944f8e491fffff5ec6a774 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Wed, 27 Mar 2024 09:56:54 +0100 Subject: [PATCH 010/108] From RSC: Remove CUDA_ARCH, Fix MPI utility function (used by RAJAPerf) --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 6b706f2d20..16c942203d 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 6b706f2d20d608ea2a9c5e4bf5d6412345b4bd4a +Subproject commit 16c942203dd4dc42d3e030d7d643c7b5c3f4108b From cbf7ee3ab9e65c476878f70c75af9c0d756ef5eb Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 10:30:45 +0100 Subject: [PATCH 011/108] From RSC: Restore basic MPI support in RAJAPerf --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 16c942203d..5a2b0e7a0b 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 16c942203dd4dc42d3e030d7d643c7b5c3f4108b +Subproject commit 5a2b0e7a0b42d1585d07ee81b78149a3aa5c5544 From c2d87e1bf5ed1710e11a65710460e5480a52d3fe Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 11:20:22 +0100 Subject: [PATCH 012/108] From RSC: RAJAPerf, Umpire, Caliper MPI handling like Axom --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 5a2b0e7a0b..841f99671c 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 5a2b0e7a0b42d1585d07ee81b78149a3aa5c5544 +Subproject commit 841f99671ca0d8bf040f48b07649f06aa0431f51 From 808c7e1c438c8dee2ff033447b3fa4f9263139c0 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 11:41:47 +0100 Subject: [PATCH 013/108] From RSC: Fix missing import --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 841f99671c..63ac13d2ac 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 841f99671ca0d8bf040f48b07649f06aa0431f51 +Subproject commit 63ac13d2acfa0d164b51e193f0e56c48b52afe0d From d18bf5eea12463af5c57703e271bcd9112207a31 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 16:04:26 +0100 Subject: [PATCH 014/108] From RSC: Fix calling class function from outside + clean super arguments --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 63ac13d2ac..fe4b00160f 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 63ac13d2acfa0d164b51e193f0e56c48b52afe0d +Subproject commit fe4b00160f09f8dd4b9d32aff396f3ba0ac8a1e2 From a17c1be9fbc88c031382bf747dd5a56cf5f96cdb Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 28 Mar 2024 18:17:51 +0100 Subject: [PATCH 015/108] Update uberenv --- scripts/uberenv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/uberenv b/scripts/uberenv index 0a39ce245d..cf91883ef0 160000 --- a/scripts/uberenv +++ b/scripts/uberenv @@ -1 +1 @@ -Subproject commit 0a39ce245d7866374bf4724bec9da6ab4cf4dfcc +Subproject commit cf91883ef0500a808338ad6c8b56647da15fa5f3 From f3ecdc2db1a28df43bdf98c7387ee6938b7790a1 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Fri, 29 Mar 2024 16:53:28 +0100 Subject: [PATCH 016/108] From RSC: Fix: use slurm on toss4 cray machines --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 056c003a1e..b09f869f9d 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 056c003a1ed89c301867813a0c20aeb337fc1d6e +Subproject commit b09f869f9d9aff6ecf6544a0161d96c2b18d13b8 From 304224d7da981a89c85dc77c95a9dbda34505f32 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 31 Mar 2024 11:41:47 -0700 Subject: [PATCH 017/108] Add compile time fraction class and test --- include/RAJA/util/types.hpp | 19 ++++++++++ test/unit/util/CMakeLists.txt | 4 ++ test/unit/util/test-fraction.cpp | 64 ++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) create mode 100644 test/unit/util/test-fraction.cpp diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 811f681b9b..3c1aeaf042 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -172,6 +172,25 @@ struct SizeList { }; +/// +/// Compile time fraction for use with integral types +/// +template +struct Fraction +{ + static_assert(denominator != int_t(0), "denominator may not be zero"); + + using inverse = Fraction; + + static constexpr int_t multiply(int_t val) noexcept + { + return (val / denominator) * numerator + + (val % denominator) * numerator / denominator; + } + +}; + + /*! ****************************************************************************** * diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt index fdec220da9..869b897714 100644 --- a/test/unit/util/CMakeLists.txt +++ b/test/unit/util/CMakeLists.txt @@ -21,4 +21,8 @@ raja_add_test( NAME test-span SOURCES test-span.cpp) +raja_add_test( + NAME test-fraction + SOURCES test-fraction.cpp) + add_subdirectory(operator) diff --git a/test/unit/util/test-fraction.cpp b/test/unit/util/test-fraction.cpp new file mode 100644 index 0000000000..5161b2bb3a --- /dev/null +++ b/test/unit/util/test-fraction.cpp @@ -0,0 +1,64 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for Fraction +/// + +#include +#include "RAJA_gtest.hpp" +#include + +template +void testFractionMultiplyTypesValues() +{ + using Frac = RAJA::Fraction; + + ASSERT_EQ(Frac::multiply(IntegerType(0)), IntegerType(0)); + + ASSERT_EQ(Frac::multiply(IntegerType(1)), + IntegerType(double(numerator) / double(denominator))); + + ASSERT_EQ(Frac::multiply(IntegerType(100)), + IntegerType(double(numerator) / double(denominator) * double(100))); + + ASSERT_EQ(Frac::multiply(IntegerType(101)), + IntegerType(double(numerator) / double(denominator) * double(101))); + + // Test where naive algorithm causes overflow, when within precision of double + if /*constexpr*/ (sizeof(IntegerType) < sizeof(double)) { + + static constexpr IntegerType max = std::numeric_limits::max(); + static constexpr IntegerType val = (numerator > denominator) ? 
+ (max / numerator * denominator) : max; + + ASSERT_EQ(Frac::multiply(IntegerType(val)), + IntegerType(double(numerator) / double(denominator) * double(val))); + } + +} + +template +void testFractionMultiplyTypes() +{ + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); + testFractionMultiplyTypesValues(); +} + + +#define RAJA_FRACTION_RUN_TEST(test) \ + test(); \ + test(); + +TEST(Fraction, basic_multiply_Fraction) +{ + RAJA_FRACTION_RUN_TEST(testFractionMultiplyTypes) +} From 2d490a54ed511dcd8f1563a07687da74dac7fdcc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 31 Mar 2024 14:01:16 -0700 Subject: [PATCH 018/108] Add occ_calc_fraction cuda/hip policies Template iteration_mapping types to allow a modifying fraction to be added that is used when calculating the max number of blocks to launch of kernels where the number of blocks is not specified. --- include/RAJA/policy/cuda/forall.hpp | 116 ++++++++++++++---- include/RAJA/policy/cuda/kernel/For.hpp | 14 +-- include/RAJA/policy/cuda/kernel/ForICount.hpp | 20 +-- include/RAJA/policy/cuda/kernel/Tile.hpp | 14 +-- .../RAJA/policy/cuda/kernel/TileTCount.hpp | 20 +-- include/RAJA/policy/cuda/kernel/internal.hpp | 36 +++--- include/RAJA/policy/cuda/launch.hpp | 48 ++++---- include/RAJA/policy/cuda/policy.hpp | 55 ++++++--- include/RAJA/policy/hip/forall.hpp | 116 ++++++++++++++---- include/RAJA/policy/hip/kernel/For.hpp | 14 +-- include/RAJA/policy/hip/kernel/ForICount.hpp | 20 +-- include/RAJA/policy/hip/kernel/Tile.hpp | 14 +-- include/RAJA/policy/hip/kernel/TileTCount.hpp | 20 +-- include/RAJA/policy/hip/kernel/internal.hpp | 36 +++--- include/RAJA/policy/hip/launch.hpp | 48 ++++---- include/RAJA/policy/hip/policy.hpp | 34 +++-- include/RAJA/util/types.hpp | 25 +++- test/include/RAJA_test-forall-execpol.hpp | 6 +- 18 files changed, 415 insertions(+), 241 deletions(-) diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index 3837a8b062..c2ddd67505 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -55,6 +55,57 @@ namespace cuda namespace impl { +/*! + ****************************************************************************** + * + * \brief Cuda grid dimension helper for strided loops template. + * + * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid + * stride loop to run multiple iterates in a single thread. 
+ * + ****************************************************************************** + */ +template +struct GridStrideHelper; + +/// handle direct policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) + { + return normal_grid_size; + } +}; + +/// handle strided loop policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + return std::min(normal_grid_size, max_grid_size); + } +}; + +/// handle strided loop policies with multiplier on iterates per thread +template +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified, Fraction>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + // use inverse multiplier on max grid size to affect number of threads + using Frac = typename Fraction::inverse; + max_grid_size = Frac::multiply(max_grid_size); + return std::min(normal_grid_size, max_grid_size); + } +}; + /*! ****************************************************************************** * @@ -77,13 +128,14 @@ struct ForallDimensionCalculator; // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -101,12 +153,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -120,12 +173,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -138,11 +192,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, 
UniqueMarker> { + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > @@ -157,13 +213,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::cuda::IndexGlobal; @@ -176,12 +233,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::cuda::IndexGlobal; @@ -201,13 +259,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::cuda::IndexGlobal; template < typename IdxT > @@ -218,7 +277,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, static_cast(IndexMapper::block_size)); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), static_cast(max_grid_size)); @@ -227,11 +286,12 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, UniqueMarker> { + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::cuda::IndexGlobal; template < typename IdxT > @@ -241,7 +301,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::CudaOccupancyCalculator oc(func); auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), static_cast(max_sizes.second)); @@ -273,7 +333,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -298,7 +358,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > 
RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -324,7 +384,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -352,7 +412,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -379,7 +439,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -405,7 +466,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -433,7 +495,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, BlocksPerSM) __global__ @@ -462,7 +525,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index 11870f13b0..90c26faca6 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -45,7 +45,7 @@ template , + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -108,7 +108,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer>; + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct CudaStatementExecutor< statement::For, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index dd7c4c4ffe..be0d15feb3 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, 
EnclosedStmts...>, Types>; @@ -103,20 +103,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct CudaStatementExecutor< statement::ForICount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index ad54c86a54..615c9943c2 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -143,7 +143,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct CudaStatementExecutor< Data, statement::Tile, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index 84a0bec412..6b6b7b3197 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : public 
CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -131,14 +131,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, + RAJA::policy::cuda::cuda_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct CudaStatementExecutor< Data, statement::TileTCount, Types> : CudaStatementExecutor, kernel_sync_requirement::none, cuda::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index a33b564309..ae0e442cdf 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -436,7 +436,7 @@ struct 
KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, cuda::IndexGlobal>> { diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 26e56e5cda..5dba388d06 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -538,7 +538,7 @@ struct LoopExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -736,18 +736,18 @@ struct LoopICountExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -890,7 +890,7 @@ struct LoopExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> 
{ diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 92c1f1c701..4a5875a769 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -883,53 +883,70 @@ using global_z = IndexGlobal; } // namespace cuda // policies usable with forall, scan, and sort + template using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; template using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; template using cuda_exec_grid = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, Async>; template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, true>; template using cuda_exec = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_calc_fraction_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_fraction_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_calc_fraction = policy::cuda::cuda_exec_explicit< + 
iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; // policies usable with WorkGroup template @@ -960,11 +977,11 @@ using policy::cuda::cuda_block_reduce; using policy::cuda::cuda_warp_reduce; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, cuda::thread_x>; @@ -990,31 +1007,31 @@ using cuda_launch_t = policy::cuda::cuda_launch_explicit_t using cuda_indexer_direct = policy::cuda::cuda_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_indexer_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; template < typename ... indexers > using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index b0b86131ef..2f9830bb31 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -56,6 +56,57 @@ namespace hip namespace impl { +/*! + ****************************************************************************** + * + * \brief Hip grid dimension helper for strided loops template. + * + * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid + * stride loop to run multiple iterates in a single thread. 
+ * + ****************************************************************************** + */ +template +struct GridStrideHelper; + +/// handle direct policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) + { + return normal_grid_size; + } +}; + +/// handle strided loop policies with no modifiers +template<> +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + return std::min(normal_grid_size, max_grid_size); + } +}; + +/// handle strided loop policies with multiplier on iterates per thread +template +struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< + named_usage::unspecified, Fraction>> +{ + template < typename IdxT > + static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) + { + // use inverse multiplier on max grid size to affect number of threads + using Frac = typename Fraction::inverse; + max_grid_size = Frac::multiply(max_grid_size); + return std::min(normal_grid_size, max_grid_size); + } +}; + /*! ****************************************************************************** * @@ -78,13 +129,14 @@ struct ForallDimensionCalculator; // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -102,12 +154,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -121,12 +174,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -139,11 +193,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, 
UniqueMarker> { + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > @@ -158,13 +214,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::hip::IndexGlobal; @@ -177,12 +234,13 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); + static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexMapper = ::RAJA::hip::IndexGlobal; @@ -202,13 +260,14 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::hip::IndexGlobal; template < typename IdxT > @@ -219,7 +278,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, static_cast(IndexMapper::block_size)); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), static_cast(max_grid_size)); @@ -228,11 +287,12 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, UniqueMarker> { + using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; using IndexMapper = ::RAJA::hip::IndexGlobal; template < typename IdxT > @@ -242,7 +302,7 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::HipOccupancyCalculator oc(func); auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); - IdxT calculated_grid_size = std::min( + IdxT calculated_grid_size = GridStrideHelper::get_grid_size( RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), static_cast(max_sizes.second)); @@ -273,7 +333,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -297,7 +357,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> 
__global__ @@ -322,7 +382,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -349,7 +409,7 @@ template ::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -375,7 +435,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -400,7 +461,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ @@ -427,7 +489,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size > 0), size_t > BlockSize = IterationGetter::block_size> __launch_bounds__(BlockSize, 1) __global__ @@ -455,7 +518,8 @@ template ::value && + std::is_base_of::value && + std::is_base_of::value && (IterationGetter::block_size <= 0), size_t > RAJA_UNUSED_ARG(BlockSize) = 0> __global__ diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index ce8e87d869..10563bc20e 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -45,7 +45,7 @@ template , + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -108,7 +108,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -123,7 +123,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE @@ -180,7 +180,7 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -195,7 +195,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer>; + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE @@ -246,7 +246,7 @@ struct HipStatementExecutor< statement::For, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index 001cc28b77..be7e256274 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -103,20 +103,20 @@ template , + RAJA::policy::hip::hip_indexer, 
kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -166,20 +166,20 @@ template , + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -226,7 +226,7 @@ struct HipStatementExecutor< statement::ForICount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 24f38b7647..51a199226f 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -143,7 +143,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -153,7 +153,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::sync, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -233,7 +233,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -243,7 +243,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator>; + using DimensionCalculator = KernelDimensionCalculator, kernel_sync_requirement::none, IndexMapper>>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) @@ -318,7 +318,7 @@ struct HipStatementExecutor< Data, statement::Tile, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index c92f92fb71..72e4114a23 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, 
EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -131,14 +131,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types> { @@ -146,7 +146,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::sync, IndexMapper>, EnclosedStmts...>, Types>; @@ -209,14 +209,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types> { @@ -224,7 +224,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, + RAJA::policy::hip::hip_indexer, kernel_sync_requirement::none, IndexMapper>, EnclosedStmts...>, Types>; @@ -281,7 +281,7 @@ struct HipStatementExecutor< Data, statement::TileTCount, Types> : HipStatementExecutor, kernel_sync_requirement::none, hip::IndexGlobal>, EnclosedStmts...>, Types> diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 2c93520b93..1c520d4af9 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -388,7 +388,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -402,7 +402,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -418,7 +418,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -436,7 +436,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -451,7 +451,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, 
hip::IndexGlobal>> { @@ -469,7 +469,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -488,7 +488,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -508,7 +508,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { @@ -527,7 +527,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, sync, hip::IndexGlobal>> { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 2e54b16a81..8f605cb538 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute> { HIP generic loop implementations */ template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -433,7 +433,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -457,7 +457,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -493,7 +493,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -538,7 +538,7 @@ struct LoopExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -625,7 +625,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -649,7 +649,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -686,7 +686,7 @@ struct LoopICountExecute -struct LoopICountExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -736,18 +736,18 @@ struct LoopICountExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -810,18 +810,18 @@ struct LoopExecute -struct LoopExecute, sync, IndexMapper0>, SEGMENT> - : LoopExecute, sync, IndexMapper0>, SEGMENT> {}; template -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1>, @@ -852,7 +852,7 @@ struct LoopExecute -struct LoopExecute, kernel_sync_requirement::none, IndexMapper0, IndexMapper1, @@ -890,7 +890,7 @@ struct LoopExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -914,7 +914,7 @@ struct TileExecute -struct TileExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { @@ -964,7 +964,7 @@ struct TileTCountExecute -struct TileTCountExecute, kernel_sync_requirement::none, IndexMapper>, SEGMENT> { diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 75f9abd878..9c72cc8993 100644 --- 
a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -879,27 +879,35 @@ using global_z = IndexGlobal; // policies usable with forall, scan, and sort template using hip_exec_grid = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, Async>; template using hip_exec_grid_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, true>; template using hip_exec = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, Async>; + iteration_mapping::Direct<>, hip::global_x, Async>; template using hip_exec_async = policy::hip::hip_exec< - iteration_mapping::Direct, hip::global_x, true>; + iteration_mapping::Direct<>, hip::global_x, true>; template using hip_exec_occ_calc = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, true>; + +template +using hip_exec_occ_calc_fraction = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, Async>; + +template +using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, true>; // policies usable with WorkGroup using policy::hip::hip_work; @@ -923,11 +931,11 @@ using policy::hip::hip_block_reduce; using policy::hip::hip_warp_reduce; using hip_warp_direct = RAJA::policy::hip::hip_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, hip::thread_x>; @@ -947,31 +955,31 @@ using policy::hip::hip_launch_t; // policies usable with kernel and launch template < typename ... indexers > using hip_indexer_direct = policy::hip::hip_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using hip_indexer_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; template < typename ... indexers > using hip_indexer_syncable_loop = policy::hip::hip_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::sync, indexers...>; template < typename ... indexers > using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer< - iteration_mapping::Direct, + iteration_mapping::Direct<>, kernel_sync_requirement::none, indexers...>; template < typename ... 
indexers > using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer< - iteration_mapping::StridedLoop, + iteration_mapping::StridedLoop, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 3c1aeaf042..95b139bce5 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -67,6 +67,18 @@ enum struct kernel_sync_requirement : int namespace iteration_mapping { +struct DirectBase {}; +struct LoopBase {}; +struct ContiguousLoopBase : LoopBase {}; +struct StridedLoopBase : LoopBase {}; +struct UnsizedLoopBase {}; +struct SizedLoopBase {}; +template < size_t t_max_iterations > +struct SizedLoopSpecifyingBase : SizedLoopBase +{ + static constexpr size_t max_iterations = t_max_iterations; +}; + /// /// Direct assumes the loop has enough iterations for all of the indices and /// maps directly from an iteration to an index. @@ -88,7 +100,8 @@ namespace iteration_mapping /// // 3 -> {3} /// // 4 -> {} /// -struct Direct {}; +template < typename ... Modifiers > +struct Direct : DirectBase {}; /// /// Contiguousloop assumes the loop has fewer iterations than indices and @@ -115,7 +128,10 @@ struct Direct {}; /// // 1 -> {3, 4, 5} /// // 2 -> {6, 7} /// -struct Contiguousloop {}; +template < size_t max_iterations, typename ... Modifiers > +struct Contiguousloop : ContiguousLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; /// /// StridedLoop assumes the loop has fewer iterations than indices and @@ -142,7 +158,10 @@ struct Contiguousloop {}; /// // 1 -> {1, 4, 7} /// // 2 -> {2, 5} /// -struct StridedLoop {}; +template < size_t max_iterations, typename ... Modifiers > +struct StridedLoop : StridedLoopBase, + std::conditional_t<(max_iterations != named_usage::unspecified), + SizedLoopSpecifyingBase, UnsizedLoopBase> {}; } // namespace iteration_mapping diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 2fe790ff93..33cc17f7eb 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -108,7 +108,8 @@ using OpenMPTargetForallAtomicExecPols = OpenMPTargetForallExecPols; using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, - RAJA::cuda_exec_explicit<256,2> >; + RAJA::cuda_exec_explicit<256,2>, + RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -119,7 +120,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols; #if defined(RAJA_ENABLE_HIP) using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, - RAJA::hip_exec_grid<256, 64> >; + RAJA::hip_exec_grid<256, 64>, + RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; using HipForallReduceExecPols = HipForallExecPols; From 95a5d07e120bbfdaf7def723f4ba0f4030a0abc6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 31 Mar 2024 14:34:26 -0700 Subject: [PATCH 019/108] Add cuda/hip_occ_calc_recommended policies These policies will represent the recommended way to use the occupancy calculator. 
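For orientation, a minimal usage sketch of the policy family this commit introduces, assuming the cuda_exec_occ_calc_recommended alias takes a block size and an optional Async flag like the other cuda_exec_* aliases (the alias is renamed in a later commit in this series). The kernel body, function name, and the 256 block size below are illustrative choices, not part of the patch:

#include "RAJA/RAJA.hpp"

// Illustrative only: sum N values with a reducer, letting the occupancy
// calculator choose the grid size for the strided-loop kernel.
void sum_example(const double* x, int N, double* result)
{
  // Alias added in this commit; signature assumed to match cuda_exec_occ_calc.
  using exec_pol = RAJA::cuda_exec_occ_calc_recommended<256>;

  RAJA::ReduceSum<RAJA::cuda_reduce, double> sum(0.0);

  RAJA::forall<exec_pol>(RAJA::RangeSegment(0, N), [=] RAJA_DEVICE (int i) {
    sum += x[i];
  });

  *result = sum.get();
}

The same usage carries over to the hip_exec_occ_calc_recommended alias on the HIP side; only the policy and reduce-policy names change.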
--- include/RAJA/policy/cuda/policy.hpp | 16 ++++++++++++++++ include/RAJA/policy/hip/policy.hpp | 8 ++++++++ 2 files changed, 24 insertions(+) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 4a5875a769..90341b9095 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -948,6 +948,22 @@ template using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; +template +using cuda_exec_occ_calc_recommended_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_recommended_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_calc_recommended = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_calc_recommended_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + // policies usable with WorkGroup template using cuda_work_explicit = policy::cuda::cuda_work_explicit; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 9c72cc8993..53ce01dc9f 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -909,6 +909,14 @@ template using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, true>; +template +using hip_exec_occ_calc_recommended = policy::hip::hip_exec< + iteration_mapping::StridedLoop>, hip::global_x, Async>; + +template +using hip_exec_occ_calc_recommended_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop>, hip::global_x, true>; + // policies usable with WorkGroup using policy::hip::hip_work; From 405446048d722b90fdea8c9072a3c2ad8ab4ed38 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 13:40:08 -0700 Subject: [PATCH 020/108] empty From ec5a68c55b16a1d769465877b8d62437136c9b4c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Apr 2024 11:16:37 -0700 Subject: [PATCH 021/108] Add some documentation --- docs/sphinx/user_guide/feature/policies.rst | 7 +++++++ include/RAJA/util/basic_mempool.hpp | 1 + 2 files changed, 8 insertions(+) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index e61be4e598..53ed56bbe1 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -271,6 +271,13 @@ policies have the prefix ``hip_``. default. Note this can improve reducer performance in kernels with large iteration counts. + cuda/hip_exec_occ_calc_recommended forall The same as + cuda/hip_exec_occ_calc + except the grid size upper bound + may be modified from the + maximum occupancy to improve performance. + Note this is the recommended + policy to use with reducers. 
cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp index 61624e0725..f0208ccbd3 100644 --- a/include/RAJA/util/basic_mempool.hpp +++ b/include/RAJA/util/basic_mempool.hpp @@ -309,6 +309,7 @@ class MemPool } + /// Free all backing allocations, even if they are currently in use void free_chunks() { #if defined(RAJA_ENABLE_OPENMP) From dd89cd4d82faa7d26c31fa24ebfcbfafd64b95a3 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Apr 2024 14:25:18 -0700 Subject: [PATCH 022/108] change type used to match recommended policy --- test/include/RAJA_test-forall-execpol.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 33cc17f7eb..458e6d06d0 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -109,7 +109,7 @@ using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, RAJA::cuda_exec_explicit<256,2>, - RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -121,7 +121,7 @@ using CudaForallAtomicExecPols = CudaForallExecPols; using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, RAJA::hip_exec_grid<256, 64>, - RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; using HipForallReduceExecPols = HipForallExecPols; From 014aebac5b16c654dfb236b78668217a2dfed390 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 2 Apr 2024 14:26:01 -0700 Subject: [PATCH 023/108] rename recommended policy cuda/hip_exec_occ_calc_recommended changed to cuda/hip_exec_rec_for_reduce --- docs/sphinx/user_guide/feature/policies.rst | 10 +++------- include/RAJA/policy/cuda/policy.hpp | 12 ++++-------- include/RAJA/policy/hip/policy.hpp | 6 ++---- 3 files changed, 9 insertions(+), 19 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 53ed56bbe1..d3f982951a 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -271,13 +271,9 @@ policies have the prefix ``hip_``. default. Note this can improve reducer performance in kernels with large iteration counts. - cuda/hip_exec_occ_calc_recommended forall The same as - cuda/hip_exec_occ_calc - except the grid size upper bound - may be modified from the - maximum occupancy to improve performance. - Note this is the recommended - policy to use with reducers. + cuda/hip_exec_rec_for_reduce forall The cuda/hip exec policy + that is recommended for + use with reducers. 
cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 90341b9095..8e98deeaf2 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -949,20 +949,16 @@ using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_occ_calc_recommended_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; +using cuda_exec_rec_for_reduce_explicit = cuda_exec_occ_calc_explicit; template -using cuda_exec_occ_calc_recommended_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; +using cuda_exec_rec_for_reduce_explicit_async = cuda_exec_occ_calc_explicit_async; template -using cuda_exec_occ_calc_recommended = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; +using cuda_exec_rec_for_reduce = cuda_exec_occ_calc; template -using cuda_exec_occ_calc_recommended_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; +using cuda_exec_rec_for_reduce_async = cuda_exec_occ_calc_async; // policies usable with WorkGroup template diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 53ce01dc9f..49cd489be4 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -910,12 +910,10 @@ using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, true>; template -using hip_exec_occ_calc_recommended = policy::hip::hip_exec< - iteration_mapping::StridedLoop>, hip::global_x, Async>; +using hip_exec_rec_for_reduce = hip_exec_occ_calc_fraction, Async>; template -using hip_exec_occ_calc_recommended_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop>, hip::global_x, true>; +using hip_exec_rec_for_reduce_async = hip_exec_occ_calc_fraction_async>; // policies usable with WorkGroup using policy::hip::hip_work; From 9cbba9148dfe0bff03a47bd131d16532e80a7404 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 12:10:29 -0700 Subject: [PATCH 024/108] Add Concretizer Remove modifiers from loop iteration mappings and move the occupancy calculator modifications into Concretizer classes that are used when block size or grid size is not specified in the ForallDimensionCalculator. 
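To make the Concretizer split concrete, here is a standalone sketch, under stated assumptions, of the pattern this commit introduces: a Concretizer turns per-kernel occupancy data into an upper bound on grid size, and the dimension calculator clamps the grid size needed to cover the iteration space against that bound. The field names mirror those added in the diff, but the OccupancyData aggregate and the pick_grid_size helper are illustrative names only, not the patch's actual classes:

#include <algorithm>
#include <cstddef>

// Cached occupancy data for one kernel on the current device (illustrative).
struct OccupancyData {
  std::size_t device_sm_per_device;    // number of SMs on the device
  std::size_t func_max_blocks_per_sm;  // occupancy-calculator result for the kernel
};

// Sketch of a max-occupancy concretizer: fill every SM at the kernel's
// maximum occupancy (compare MaxOccupancyConcretizer in the diff).
struct MaxOccupancyConcretizerSketch {
  template <typename IdxT>
  static IdxT get_max_grid_size(OccupancyData const& data)
  {
    return static_cast<IdxT>(data.func_max_blocks_per_sm * data.device_sm_per_device);
  }
};

// How a dimension calculator might use a Concretizer when only the block size
// is fixed: take the smaller of "blocks needed for len" and the Concretizer's
// device-fitting bound (compare get_grid_size_to_fit_device in the diff).
template <typename Concretizer, typename IdxT>
IdxT pick_grid_size(OccupancyData const& data, IdxT len, IdxT block_size)
{
  IdxT blocks_for_len = (len + block_size - 1) / block_size;  // divide, rounding up
  IdxT max_blocks     = Concretizer::template get_max_grid_size<IdxT>(data);
  return std::min(blocks_for_len, max_blocks);
}

Swapping in a fraction-and-offset concretizer only changes how max_blocks is derived from the occupancy data; the clamping logic in the dimension calculator stays the same, which is the point of factoring the modifiers out of the iteration-mapping types.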
--- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 253 +++++++++-------- include/RAJA/policy/cuda/forall.hpp | 217 ++++++-------- .../RAJA/policy/cuda/kernel/CudaKernel.hpp | 27 +- include/RAJA/policy/cuda/kernel/For.hpp | 4 +- include/RAJA/policy/cuda/kernel/ForICount.hpp | 6 +- include/RAJA/policy/cuda/kernel/Tile.hpp | 4 +- .../RAJA/policy/cuda/kernel/TileTCount.hpp | 6 +- include/RAJA/policy/cuda/kernel/internal.hpp | 18 +- include/RAJA/policy/cuda/launch.hpp | 24 +- include/RAJA/policy/cuda/policy.hpp | 205 ++++++++++++-- include/RAJA/policy/cuda/scan.hpp | 12 +- include/RAJA/policy/cuda/sort.hpp | 66 +++-- include/RAJA/policy/hip/MemUtils_HIP.hpp | 264 ++++++++++-------- include/RAJA/policy/hip/forall.hpp | 217 ++++++-------- include/RAJA/policy/hip/kernel.hpp | 2 +- include/RAJA/policy/hip/kernel/For.hpp | 4 +- include/RAJA/policy/hip/kernel/ForICount.hpp | 6 +- include/RAJA/policy/hip/kernel/HipKernel.hpp | 27 +- include/RAJA/policy/hip/kernel/Tile.hpp | 4 +- include/RAJA/policy/hip/kernel/TileTCount.hpp | 6 +- include/RAJA/policy/hip/kernel/internal.hpp | 18 +- include/RAJA/policy/hip/launch.hpp | 24 +- include/RAJA/policy/hip/policy.hpp | 154 ++++++++-- include/RAJA/policy/hip/scan.hpp | 12 +- include/RAJA/policy/hip/sort.hpp | 66 +++-- include/RAJA/util/resource.hpp | 20 +- include/RAJA/util/types.hpp | 8 +- test/include/RAJA_test-forall-execpol.hpp | 6 +- 28 files changed, 986 insertions(+), 694 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 2a8f848825..7eee19dacf 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -299,184 +299,207 @@ cudaDeviceProp& device_prop() struct CudaFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device; + int device_max_threads_per_sm; }; RAJA_INLINE -size_t cuda_max_blocks(size_t block_size) +CudaFixedMaxBlocksData cuda_max_blocks() { - static CudaFixedMaxBlocksData data = []() { - cudaDeviceProp& prop = cuda::device_prop(); - return CudaFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); + static thread_local CudaFixedMaxBlocksData data { + cuda::device_prop().multiProcessorCount, + cuda::device_prop().maxThreadsPerMultiProcessor }; - size_t max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); - - return max_blocks; + return data; } struct CudaOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block; + int func_max_blocks_per_device; + int func_max_threads_per_block; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local CudaOccMaxBlocksThreadsData data { + std::numeric_limits::max(), + -1, + -1 }; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { cudaErrchk(cudaOccupancyMaxPotentialBlockSize( - 
&data.max_blocks, &data.max_threads, func, shmem_size)); + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); - data.prev_shmem_size = shmem_size; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct CudaOccMaxBlocksFixedThreadsData +struct CudaOccMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block; + int func_threads_per_block; + int device_sm_per_device; + int device_max_threads_per_sm; + int func_max_blocks_per_sm; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; - - if (data.prev_shmem_size != shmem_size) { + static thread_local CudaOccMaxBlocksData data { + std::numeric_limits::max(), + func_threads_per_block, + cuda::device_prop().multiProcessorCount, + cuda::device_prop().maxThreadsPerMultiProcessor, + -1 }; - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); - - if (data.multiProcessorCount == uninitialized) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { - data.multiProcessorCount = cuda::device_prop().multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void cuda_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; - - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { - - cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + static thread_local CudaOccMaxBlocksData data { + std::numeric_limits::max(), + -1, + cuda::device_prop().multiProcessorCount, + cuda::device_prop().maxThreadsPerMultiProcessor, + -1 }; - if (data.multiProcessorCount == uninitialized) { + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { 
- data.multiProcessorCount = cuda::device_prop().multiProcessorCount; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; + cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); } - max_blocks = data.max_blocks; - + return data; } -struct CudaOccupancyDefaults +/*! + ****************************************************************************** + * + * \brief Cuda Concretizer Implementation. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class the determines the max number of blocks to use when + * fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - CudaOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + // Get the maximum block size + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = cuda_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + // Get a block size that combined with the given grid size is large enough + // to do len work, or 0 if not possible + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::cuda::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct CudaOccupancyCalculator -{ - CudaOccupancyCalculator(const void* func) - : m_func(func) - { } + // Get a grid size that combined with the given block size is large enough + // to do len work + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + // Get a block size and grid size that combined is large enough + // to do len work + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + // Get a block size that combined with the given grid size is the smaller of + // the amount need to achieve maximum 
occupancy on the device or + // the amount needed to do len work + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + // Get a grid size that combined with the given block size is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = cuda_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = cuda_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + // Get a block size and grid size that combined is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::cuda::cuda_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace cuda diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp index c2ddd67505..333f0f90e8 100644 --- a/include/RAJA/policy/cuda/forall.hpp +++ b/include/RAJA/policy/cuda/forall.hpp @@ -55,57 +55,6 @@ namespace cuda namespace impl { -/*! - ****************************************************************************** - * - * \brief Cuda grid dimension helper for strided loops template. - * - * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid - * stride loop to run multiple iterates in a single thread. 
- * - ****************************************************************************** - */ -template -struct GridStrideHelper; - -/// handle direct policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) - { - return normal_grid_size; - } -}; - -/// handle strided loop policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - return std::min(normal_grid_size, max_grid_size); - } -}; - -/// handle strided loop policies with multiplier on iterates per thread -template -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified, Fraction>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - // use inverse multiplier on max grid size to affect number of threads - using Frac = typename Fraction::inverse; - max_grid_size = Frac::multiply(max_grid_size); - return std::min(normal_grid_size, max_grid_size); - } -}; - /*! ****************************************************************************** * @@ -121,21 +70,21 @@ struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; @@ -143,8 +92,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -153,160 +104,168 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t 
RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_cuda_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_cuda_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_cuda_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); + + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not 
supported in this configuration"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_cuda_dim(dims.threads, calculated_block_size); - internal::set_cuda_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_cuda_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, block_size); + internal::set_cuda_dim(dims.blocks, grid_size); } }; -template -struct 
ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::cuda::IndexGlobal, + Concretizer, UniqueMarker> { - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::cuda::IndexGlobal; + using IndexGetter = ::RAJA::cuda::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::CudaDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::cuda::CudaOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::cuda::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_cuda_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_cuda_dim(dims.blocks, calculated_grid_size); + internal::set_cuda_dim(dims.threads, sizes.first); + internal::set_cuda_dim(dims.blocks, sizes.second); } }; @@ -558,7 +517,7 @@ void forallp_cuda_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -566,7 +525,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicitconst&, + ::RAJA::policy::cuda::cuda_exec_explicitconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -574,9 +533,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -627,7 +586,7 @@ forall_impl(resources::Cuda cuda_res, template RAJA_INLINE concepts::enable_if_t< @@ -635,7 +594,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit const&, + ::RAJA::policy::cuda::cuda_exec_explicit const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -643,9 +602,9 @@ forall_impl(resources::Cuda cuda_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; + using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit; using UniqueMarker = ::camp::list, LOOP_BODY, Iterator, ForallParam>; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -723,11 +682,11 @@ forall_impl(resources::Cuda cuda_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Cuda r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -736,7 +695,7 @@ forall_impl(resources::Cuda r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::cuda::cuda_exec_explicit(), + ::RAJA::policy::cuda::cuda_exec_explicit(), loop_body); } // iterate over segments of index set diff 
--git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp index 6497a64f42..c070d618ea 100644 --- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp +++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct CudaKernelExt - : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit, EnclosedStmts...> { }; @@ -284,7 +284,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -294,8 +294,10 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -305,8 +307,9 @@ struct CudaLaunchHelper( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -360,7 +363,7 @@ struct CudaLaunchHelper(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -373,16 +376,18 @@ struct CudaLaunchHelper( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::cuda::cuda_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::cuda::cuda_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp index 90c26faca6..9de20c7b4b 100644 --- a/include/RAJA/policy/cuda/kernel/For.hpp +++ b/include/RAJA/policy/cuda/kernel/For.hpp @@ -45,7 +45,7 @@ template , sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct CudaStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>>; + RAJA::policy::cuda::cuda_indexer>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp index be0d15feb3..8486abaa2c 100644 --- a/include/RAJA/policy/cuda/kernel/ForICount.hpp +++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> : CudaStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { using Base = CudaStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp index 615c9943c2..ad901f6b02 100644 --- a/include/RAJA/policy/cuda/kernel/Tile.hpp +++ b/include/RAJA/policy/cuda/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct CudaStatementExecutor< 
using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; + using DimensionCalculator = KernelDimensionCalculator>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp index 6b6b7b3197..c611346d46 100644 --- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp +++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct CudaStatementExecutor< Data, statement::TileTCount, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> : public CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct CudaStatementExecutor< Data, statement::Tile, - RAJA::policy::cuda::cuda_indexer, sync, IndexMapper>, + RAJA::policy::cuda::cuda_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp index ae0e442cdf..9c904ea45a 100644 --- a/include/RAJA/policy/cuda/kernel/internal.hpp +++ b/include/RAJA/policy/cuda/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index 5dba388d06..602221e58a 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute -struct LoopExecute, +struct LoopExecute, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct LoopICountExecute, +struct LoopICountExecute, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute -struct LoopExecute, +struct LoopExecute, SEGMENT> - : LoopExecute, + : LoopExecute, SEGMENT> {}; template -struct LoopExecute, +struct LoopExecute, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct TileExecute, +struct TileExecute, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct 
TileTCountExecute, +struct TileTCountExecute, SEGMENT> { diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 8e98deeaf2..f8ec6773c8 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -22,6 +22,7 @@ #if defined(RAJA_CUDA_ACTIVE) +#include #include #include "RAJA/pattern/reduce.hpp" @@ -78,6 +79,86 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + } // namespace cuda namespace policy @@ -100,7 +181,8 @@ struct cuda_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>; }; -template +template struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::forall, @@ -108,9 +190,11 @@ struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::cuda> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; -template +template struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::cuda, RAJA::Pattern::region, @@ -119,8 +203,6 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform }; - - // // NOTE: There is no Index set segment iteration policy for CUDA // @@ -882,83 +964,144 @@ using global_z = IndexGlobal; } // namespace cuda +// contretizers used in forall, scan, and sort policies + +using CudaDefaultAvoidMaxOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer, -1>; + +template < typename AvoidMaxOccupancyConcretizer > +using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer; + +using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; + +using CudaRecForReduceConcretizer = cuda::MaxOccupancyConcretizer; + +using CudaDefaultConcretizer = cuda::MaxOccupancyConcretizer; + // policies usable with forall, scan, and sort template using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_grid = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_explicit = 
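A standalone numeric sketch of the grid-size formula documented for FractionOffsetOccupancyConcretizer above, (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm, where each adjustment is applied only while it keeps the block count positive. The struct and function names below are illustrative stand-ins rather than RAJA types; the 1/2 fraction is assumed for illustration, only the -1 offset is taken from the default alias above.

// Standalone sketch (not RAJA code) of the fraction/offset grid-size formula:
//   max_grid = (Fraction * func_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
// Each adjustment is applied only when it leaves a positive block count, which
// is the "care to avoid zero" mentioned in the doc comment above.
#include <cstddef>
#include <cstdio>

struct OccupancyData {        // illustrative stand-in for the cached occupancy data
  int device_sm_per_device;
  int func_max_blocks_per_sm;
};

template <int Num, int Den, std::ptrdiff_t BlocksPerSmOffset>
int fraction_offset_max_grid_size(OccupancyData const& data)
{
  int blocks_per_sm = data.func_max_blocks_per_sm;

  // apply the fraction, but never drop to zero
  const int scaled = (blocks_per_sm * Num) / Den;
  if (scaled > 0) { blocks_per_sm = scaled; }

  // apply the offset, but never drop to zero
  const std::ptrdiff_t offset = blocks_per_sm + BlocksPerSmOffset;
  if (offset > 0) { blocks_per_sm = static_cast<int>(offset); }

  return blocks_per_sm * data.device_sm_per_device;
}

int main()
{
  // A kernel that fits 4 blocks per SM on an 80-SM device, scaled by 1/2 with
  // a -1 block offset: (4 * 1/2 - 1) * 80 = 80 blocks.
  OccupancyData data{80, 4};
  std::printf("%d\n", fraction_offset_max_grid_size<1, 2, -1>(data));
  return 0;
}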
policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::Direct<>, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::Direct, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_occ_calc_fraction_explicit = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, Async>; +using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_occ_calc_fraction_explicit_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, BLOCKS_PER_SM, true>; +using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_occ_calc_fraction = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, Async>; +using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_occ_calc_fraction_async = policy::cuda::cuda_exec_explicit< - iteration_mapping::StridedLoop, cuda::global_x, policy::cuda::MIN_BLOCKS_PER_SM, true>; +using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using 
cuda_exec_occ_avoid_max_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_avoid_max_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_avoid_max = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_avoid_max_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce_explicit = cuda_exec_occ_calc_explicit; +using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_explicit_async = cuda_exec_occ_calc_explicit_async; +using cuda_exec_rec_for_reduce_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce = cuda_exec_occ_calc; +using cuda_exec_rec_for_reduce = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_async = cuda_exec_occ_calc_async; +using cuda_exec_rec_for_reduce_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + // policies usable with WorkGroup template @@ -989,7 +1132,7 @@ using policy::cuda::cuda_block_reduce; using policy::cuda::cuda_warp_reduce; using cuda_warp_direct = RAJA::policy::cuda::cuda_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, cuda::thread_x>; using cuda_warp_loop = RAJA::policy::cuda::cuda_indexer< @@ -1019,7 +1162,7 @@ using cuda_launch_t = policy::cuda::cuda_launch_explicit_t using cuda_indexer_direct = policy::cuda::cuda_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; @@ -1037,7 +1180,7 @@ using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer< template < typename ... 
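For callers, the refactored aliases above remain drop-in forall policies. A hedged usage sketch, assuming a CUDA build of RAJA and device-accessible arrays x and y:

// Hedged usage sketch: the policy aliases above are ordinary template
// arguments to RAJA::forall; only the internals of how launch dimensions are
// concretized changed. Assumes a CUDA build of RAJA.
#include "RAJA/RAJA.hpp"

void daxpy(double* y, const double* x, double a, int n)
{
  // direct mapping: one iterate per thread, grid sized to cover n
  RAJA::forall<RAJA::cuda_exec<256>>(
      RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) { y[i] = a * x[i] + y[i]; });

  // occupancy-capped grid-stride loop over the same range
  RAJA::forall<RAJA::cuda_exec_occ_calc<256>>(
      RAJA::TypedRangeSegment<int>(0, n),
      [=] RAJA_DEVICE (int i) { y[i] = a * x[i] + y[i]; });
}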
indexers > using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp index 5d89844e3c..0a9b0bf305 100644 --- a/include/RAJA/policy/cuda/scan.hpp +++ b/include/RAJA/policy/cuda/scan.hpp @@ -44,6 +44,7 @@ namespace scan */ template inclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op) @@ -96,6 +97,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, Function binary_op, @@ -152,6 +154,7 @@ exclusive_inplace( */ template inclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, @@ -206,6 +209,7 @@ inclusive( */ template exclusive( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp index 6e6e4c5696..c5a353b704 100644 --- a/include/RAJA/policy/cuda/sort.hpp +++ b/include/RAJA/policy/cuda/sort.hpp @@ -44,7 +44,9 @@ namespace sort /*! \brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -54,7 +56,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -75,13 +77,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::less>) @@ -143,13 +147,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter begin, Iter end, operators::greater>) @@ -212,7 +218,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -222,7 +230,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, Iter, Iter, Compare) @@ -243,13 +251,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::less> comp) @@ -260,13 +270,15 @@ unstable( /*! 
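The Concretizer parameter threaded through the scan overloads above is internal plumbing; the user-facing scan interface is unchanged. A hedged sketch, assuming the span-based RAJA scan API and a device-accessible array d_vals of length n:

// Hedged usage sketch: user-facing scans look the same before and after the
// concretizer refactor. Assumes the span-based RAJA scan overloads and a
// device-accessible array d_vals.
#include "RAJA/RAJA.hpp"

void prefix_sums(double* d_vals, int n)
{
  // in-place inclusive scan, summing with RAJA's plus operator
  RAJA::inclusive_scan_inplace<RAJA::cuda_exec<256>>(
      RAJA::make_span(d_vals, n), RAJA::operators::plus<double>{});

  // in-place exclusive scan over the same span
  RAJA::exclusive_scan_inplace<RAJA::cuda_exec<256>>(
      RAJA::make_span(d_vals, n), RAJA::operators::plus<double>{});
}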
\brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, Iter begin, Iter end, operators::greater> comp) @@ -278,7 +290,8 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -314,7 +327,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -322,7 +336,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -396,7 +410,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -404,7 +419,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -479,7 +494,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit, + ::RAJA::policy::cuda::cuda_exec_explicit, KeyIter, KeyIter, ValIter, @@ -515,7 +531,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -523,7 +540,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -535,7 +552,8 @@ unstable_pairs( /*! 
\brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -543,7 +561,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Cuda cuda_res, - ::RAJA::policy::cuda::cuda_exec_explicit p, + ::RAJA::policy::cuda::cuda_exec_explicit p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index e45d3a6aff..9b8442637b 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -301,203 +301,227 @@ hipDeviceProp_t& device_prop() struct HipFixedMaxBlocksData { - int multiProcessorCount; - int maxThreadsPerMultiProcessor; + int device_sm_per_device; + int device_max_threads_per_sm; }; RAJA_INLINE -int hip_max_blocks(int block_size) +HipFixedMaxBlocksData hip_max_blocks() { - static HipFixedMaxBlocksData data = []() { - hipDeviceProp_t& prop = hip::device_prop(); - return HipFixedMaxBlocksData{prop.multiProcessorCount, - prop.maxThreadsPerMultiProcessor}; - }(); - - int max_blocks = data.multiProcessorCount * - (data.maxThreadsPerMultiProcessor / block_size); + static thread_local HipFixedMaxBlocksData data { + hip::device_prop().multiProcessorCount, + hip::device_prop().maxThreadsPerMultiProcessor }; - return max_blocks; + return data; } struct HipOccMaxBlocksThreadsData { - size_t prev_shmem_size; - int max_blocks; - int max_threads; + size_t func_dynamic_shmem_per_block; + int func_max_blocks_per_device; + int func_max_threads_per_block; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks_threads(Func&& func, size_t shmem_size, - int &max_blocks, int &max_threads) +HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksThreadsData data { + std::numeric_limits::max(), + -1, + -1 }; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxPotentialBlockSize( - &data.max_blocks, &data.max_threads, func, shmem_size)); + &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); hipDeviceProp_t& prop = hip::device_prop(); - data.max_blocks = prop.multiProcessorCount; - data.max_threads = 1024; + data.func_max_blocks_per_device = prop.multiProcessorCount; + data.func_max_threads_per_block = 1024; #endif - data.prev_shmem_size = shmem_size; + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; } - max_blocks = data.max_blocks; - max_threads = data.max_threads; - + return data; } -struct HipOccMaxBlocksFixedThreadsData +struct HipOccMaxBlocksData { - size_t prev_shmem_size; - int max_blocks; - int multiProcessorCount; + size_t func_dynamic_shmem_per_block; + int func_threads_per_block; + int device_sm_per_device; + int device_max_threads_per_sm; + int func_max_blocks_per_sm; }; -template < typename RAJA_UNUSED_ARG(UniqueMarker), int num_threads, typename Func > +template < typename 
RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksFixedThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data { + std::numeric_limits::max(), + func_threads_per_block, + hip::device_prop().multiProcessorCount, + hip::device_prop().maxThreadsPerMultiProcessor, + -1 }; - if (data.prev_shmem_size != shmem_size) { + if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; } - max_blocks = data.max_blocks; - + return data; } -struct HipOccMaxBlocksVariableThreadsData -{ - size_t prev_shmem_size; - int prev_num_threads; - int max_blocks; - int multiProcessorCount; -}; - -template < typename RAJA_UNUSED_ARG(UniqueMarker), typename Func > +template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE -void hip_occupancy_max_blocks(Func&& func, size_t shmem_size, - int &max_blocks, int num_threads) +HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, + size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized = 0; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local HipOccMaxBlocksVariableThreadsData data = { - uninitialized_size_t, uninitialized, uninitialized, uninitialized}; + static thread_local HipOccMaxBlocksData data { + std::numeric_limits::max(), + -1, + hip::device_prop().multiProcessorCount, + hip::device_prop().maxThreadsPerMultiProcessor, + -1 }; + + if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || + data.func_threads_per_block != func_threads_per_block ) { - if ( data.prev_shmem_size != shmem_size || - data.prev_num_threads != num_threads ) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( - &data.max_blocks, func, num_threads, shmem_size)); + &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); #else RAJA_UNUSED_VAR(func); - data.max_blocks = hip::device_prop().maxThreadsPerMultiProcessor/1024; - if (data.max_blocks <= 0) { data.max_blocks = 1 } + data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024; + if (data.func_max_blocks_per_sm <= 
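The occupancy helpers above share one memoization pattern: the UniqueMarker template parameter gives every kernel its own static thread_local cache, and the driver query reruns only when the dynamic shared-memory size (or block size) changes. A minimal standalone sketch of that pattern, with illustrative names and a stubbed-out driver query:

// Standalone sketch of the per-kernel occupancy caching used above.
#include <cstddef>
#include <limits>

// stand-in for the driver call (e.g. hipOccupancyMaxActiveBlocksPerMultiprocessor)
static int query_blocks_per_sm(const void* /*func*/, int /*threads_per_block*/,
                               std::size_t /*dynamic_shmem*/)
{
  return 4;  // fixed value keeps the sketch self-contained
}

struct CachedOccupancy {
  std::size_t dynamic_shmem = std::numeric_limits<std::size_t>::max();  // "uninitialized"
  int blocks_per_sm = -1;
};

// UniqueKernelMarker is never used directly; being a template parameter is
// enough to instantiate a distinct static thread_local cache per kernel.
template <typename UniqueKernelMarker>
CachedOccupancy cached_occupancy(const void* func, int threads_per_block,
                                 std::size_t dynamic_shmem)
{
  static thread_local CachedOccupancy cache;
  if (cache.dynamic_shmem != dynamic_shmem) {   // refresh only when inputs change
    cache.blocks_per_sm = query_blocks_per_sm(func, threads_per_block, dynamic_shmem);
    cache.dynamic_shmem = dynamic_shmem;
  }
  return cache;
}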
0) { data.func_max_blocks_per_sm = 1 } #endif - if (data.multiProcessorCount == uninitialized) { - - data.multiProcessorCount = hip::device_prop().multiProcessorCount; - - } - - data.max_blocks *= data.multiProcessorCount; - - data.prev_shmem_size = shmem_size; - data.prev_num_threads = num_threads; - } - max_blocks = data.max_blocks; - + return data; } -struct HipOccupancyDefaults +/*! + ****************************************************************************** + * + * \brief Hip Concretizer Implementation. + * + * \tparam IdxT Index type to use for integer calculations. + * \tparam Concretizer Class the determines the max number of blocks to use when + * fitting for the device. + * \tparam UniqueMarker A type that is unique to each global function, used to + * help cache the occupancy data for that global function. + * + ****************************************************************************** + */ +template < typename IdxT, typename Concretizer, typename UniqueMarker> +struct ConcretizerImpl { - HipOccupancyDefaults(const void* RAJA_UNUSED_ARG(func)) + ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len) + : m_func(func) + , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block) + , m_len(len) { } - template < typename IdxT > - inline auto get_max_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size), - IdxT RAJA_UNUSED_ARG(block_size)) const + // Get the maximum block size + IdxT get_max_block_size() const { - return std::numeric_limits::max(); + auto data = hip_occupancy_max_blocks_threads( + m_func, m_func_dynamic_shmem_per_block); + IdxT func_max_threads_per_block = data.func_max_threads_per_block; + return func_max_threads_per_block; } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) const + // Get a block size that combined with the given grid size is large enough + // to do len work, or 0 if not possible + IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { - return std::make_pair(static_cast(::RAJA::policy::hip::MAX_BLOCK_SIZE), - std::numeric_limits::max()); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + if (func_threads_per_block <= func_max_threads_per_block) { + return func_threads_per_block; + } else { + return IdxT(0); + } } -}; -template < typename UniqueMarker > -struct HipOccupancyCalculator -{ - HipOccupancyCalculator(const void* func) - : m_func(func) - { } + // Get a grid size that combined with the given block size is large enough + // to do len work + IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const + { + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return func_blocks_per_device; + } + + // Get a block size and grid size that combined is large enough + // to do len work + auto get_block_and_grid_size_to_fit_len() const + { + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); + } + + // Get a block size that combined with the given grid size is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const + { + IdxT func_max_threads_per_block = 
this->get_max_block_size(); + IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device); + return std::min(func_threads_per_block, func_max_threads_per_block); + } - template < typename IdxT > - inline auto get_max_grid_size(size_t dynamic_shmem_size, IdxT block_size) const + // Get a grid size that combined with the given block size is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks( - m_func, dynamic_shmem_size, max_grid_size, block_size); - return static_cast(max_grid_size); + auto data = hip_occupancy_max_blocks( + m_func, m_func_dynamic_shmem_per_block, func_threads_per_block); + IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size(data); + IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); + return std::min(func_blocks_per_device, func_max_blocks_per_device); } - template < typename IdxT = hip_dim_member_t > - inline auto get_max_block_size_and_grid_size(size_t dynamic_shmem_size) const + // Get a block size and grid size that combined is the smaller of + // the amount need to achieve maximum occupancy on the device or + // the amount needed to do len work + auto get_block_and_grid_size_to_fit_device() const { - int max_block_size = -1; - int max_grid_size = -1; - ::RAJA::hip::hip_occupancy_max_blocks_threads( - m_func, dynamic_shmem_size, max_grid_size, max_block_size); - return std::make_pair(static_cast(max_block_size), - static_cast(max_grid_size)); + IdxT func_max_threads_per_block = this->get_max_block_size(); + IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block); + return std::make_pair(func_max_threads_per_block, + func_blocks_per_device); } private: const void* m_func; + size_t m_func_dynamic_shmem_per_block; + IdxT m_len; }; } // namespace hip diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp index 2f9830bb31..6fa21f9217 100644 --- a/include/RAJA/policy/hip/forall.hpp +++ b/include/RAJA/policy/hip/forall.hpp @@ -56,57 +56,6 @@ namespace hip namespace impl { -/*! - ****************************************************************************** - * - * \brief Hip grid dimension helper for strided loops template. - * - * \tparam MappingModifiers Decide how many blocks to use cased on the . For example StridedLoop uses a grid - * stride loop to run multiple iterates in a single thread. 
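A standalone numeric sketch of the two grid-sizing modes ConcretizerImpl exposes above: the fit-len queries launch exactly enough blocks to cover the range, while the fit-device queries cap the grid at the occupancy-derived limit so longer ranges fall back to a grid-stride loop. Function names and the example numbers are illustrative:

// Standalone sketch of "fit len" vs "fit device" grid sizing.
#include <algorithm>
#include <cstdio>

constexpr int divide_ceiling(int n, int d) { return (n + d - 1) / d; }

// enough blocks to cover len iterates at the given block size
int grid_to_fit_len(int len, int block_size)
{
  return divide_ceiling(len, block_size);
}

// the smaller of "enough blocks to cover len" and the occupancy-derived cap
int grid_to_fit_device(int len, int block_size, int func_max_blocks_per_device)
{
  return std::min(grid_to_fit_len(len, block_size), func_max_blocks_per_device);
}

int main()
{
  // 1M iterates, 256-thread blocks, kernel capped at 2 blocks/SM on 80 SMs
  std::printf("%d\n", grid_to_fit_len(1 << 20, 256));            // 4096
  std::printf("%d\n", grid_to_fit_device(1 << 20, 256, 2 * 80)); // 160
  return 0;
}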
- * - ****************************************************************************** - */ -template -struct GridStrideHelper; - -/// handle direct policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::Direct<>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT RAJA_UNUSED_ARG(max_grid_size)) - { - return normal_grid_size; - } -}; - -/// handle strided loop policies with no modifiers -template<> -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - return std::min(normal_grid_size, max_grid_size); - } -}; - -/// handle strided loop policies with multiplier on iterates per thread -template -struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< - named_usage::unspecified, Fraction>> -{ - template < typename IdxT > - static constexpr IdxT get_grid_size(IdxT normal_grid_size, IdxT max_grid_size) - { - // use inverse multiplier on max grid size to affect number of threads - using Frac = typename Fraction::inverse; - max_grid_size = Frac::multiply(max_grid_size); - return std::min(normal_grid_size, max_grid_size); - } -}; - /*! ****************************************************************************** * @@ -122,21 +71,21 @@ struct GridStrideHelper<::RAJA::iteration_mapping::StridedLoop< * ****************************************************************************** */ -template +template struct ForallDimensionCalculator; // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0 // there are specializations for named_usage::unspecified // but named_usage::ignored is not supported so no specializations are provided // and static_asserts in the general case catch unsupported values -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; @@ -144,8 +93,10 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct (static_cast(IndexGetter::block_size) * - static_cast(IndexGetter::grid_size)) ) { + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + if ( len > (block_size * grid_size) ) { RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); } @@ -154,160 +105,168 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t 
RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - // BEWARE: if calculated block_size is too high then the kernel launch will fail - internal::set_hip_dim(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::grid_size))); - internal::set_hip_dim(dims.blocks, static_cast(IndexGetter::grid_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size); + + if ( block_size == IdxT(0) ) { + RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space"); + } + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, - const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) + const void* func, size_t dynamic_shmem_size) { - internal::set_hip_dim(dims.threads, static_cast(IndexGetter::block_size)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexGetter::block_size))); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; + + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first))); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_len(); + + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this 
configuration"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT RAJA_UNUSED_ARG(len), const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size)) { - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall"); - static_assert(sizeof...(MappingModifiers) == 0, "MappingModifiers not supported in this configuration"); - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_block_size = std::min( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::grid_size)), - static_cast(max_sizes.first)); + const IdxT grid_size = static_cast(IndexGetter::grid_size); + const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size); - internal::set_hip_dim(dims.threads, calculated_block_size); - internal::set_hip_dim(dims.blocks, static_cast(IndexMapper::grid_size)); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, +template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall"); - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_grid_size = oc.get_max_grid_size(dynamic_shmem_size, - static_cast(IndexMapper::block_size)); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(IndexMapper::block_size)), - static_cast(max_grid_size)); + const IdxT block_size = static_cast(IndexGetter::block_size); + const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size); - internal::set_hip_dim(dims.threads, static_cast(IndexMapper::block_size)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, block_size); + internal::set_hip_dim(dims.blocks, grid_size); } }; -template -struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, 
+template +struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop, ::RAJA::hip::IndexGlobal, + Concretizer, UniqueMarker> { - using IterationMapping = ::RAJA::iteration_mapping::StridedLoop; - using IndexMapper = ::RAJA::hip::IndexGlobal; + using IndexGetter = ::RAJA::hip::IndexGlobal; template < typename IdxT > static void set_dimensions(internal::HipDims& dims, IdxT len, const void* func, size_t dynamic_shmem_size) { - ::RAJA::hip::HipOccupancyCalculator oc(func); - auto max_sizes = oc.get_max_block_size_and_grid_size(dynamic_shmem_size); + ::RAJA::hip::ConcretizerImpl concretizer{func, dynamic_shmem_size, len}; - IdxT calculated_grid_size = GridStrideHelper::get_grid_size( - RAJA_DIVIDE_CEILING_INT(len, static_cast(max_sizes.first)), - static_cast(max_sizes.second)); + const auto sizes = concretizer.get_block_and_grid_size_to_fit_device(); - internal::set_hip_dim(dims.threads, static_cast(max_sizes.first)); - internal::set_hip_dim(dims.blocks, calculated_grid_size); + internal::set_hip_dim(dims.threads, sizes.first); + internal::set_hip_dim(dims.blocks, sizes.second); } }; @@ -551,7 +510,7 @@ void forallp_hip_kernel(LOOP_BODY loop_body, template RAJA_INLINE concepts::enable_if_t< @@ -559,7 +518,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_execconst&, + ::RAJA::policy::hip::hip_execconst&, Iterable&& iter, LoopBody&& loop_body, ForallParam) @@ -567,9 +526,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -620,7 +579,7 @@ forall_impl(resources::Hip hip_res, template RAJA_INLINE concepts::enable_if_t< @@ -628,7 +587,7 @@ concepts::enable_if_t< RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty> > forall_impl(resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec const&, + ::RAJA::policy::hip::hip_exec const&, Iterable&& iter, LoopBody&& loop_body, ForallParam f_params) @@ -636,9 +595,9 @@ forall_impl(resources::Hip hip_res, using Iterator = camp::decay; using LOOP_BODY = camp::decay; using IndexType = camp::decay; - using EXEC_POL = ::RAJA::policy::hip::hip_exec; + using EXEC_POL = ::RAJA::policy::hip::hip_exec; using UniqueMarker = ::camp::list; - using DimensionCalculator = impl::ForallDimensionCalculator; + using DimensionCalculator = impl::ForallDimensionCalculator; // // Compute the requested iteration space size @@ -716,11 +675,11 @@ forall_impl(resources::Hip hip_res, */ template RAJA_INLINE resources::EventProxy forall_impl(resources::Hip r, - ExecPolicy>, + ExecPolicy>, const TypedIndexSet& iset, LoopBody&& loop_body) { @@ -729,7 +688,7 @@ forall_impl(resources::Hip r, iset.segmentCall(r, isi, detail::CallForall(), - ::RAJA::policy::hip::hip_exec(), + ::RAJA::policy::hip::hip_exec(), loop_body); } // iterate over segments of index set diff --git a/include/RAJA/policy/hip/kernel.hpp b/include/RAJA/policy/hip/kernel.hpp index 678d48e3c1..4f907f5f5f 100644 --- a/include/RAJA/policy/hip/kernel.hpp +++ b/include/RAJA/policy/hip/kernel.hpp @@ -4,7 +4,7 @@ * \file * * \brief RAJA 
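A condensed standalone sketch of how the ForallDimensionCalculator specializations above choose launch dimensions. The occupancy cap is taken as a given input here, whereas the real code obtains it from the Concretizer, and all names are illustrative:

// Standalone sketch of the dimension selection per iteration mapping.
#include <algorithm>
#include <utility>

constexpr int div_ceil(int n, int d) { return (n + d - 1) / d; }

// Direct with a fixed block size: one thread per iterate, grid sized to cover len.
std::pair<int, int> direct_fixed_block(int len, int block_size)
{
  return {block_size, div_ceil(len, block_size)};
}

// StridedLoop with a fixed block size: cover len if possible, but never exceed
// the occupancy-derived grid cap; the kernel's grid-stride loop picks up the rest.
std::pair<int, int> strided_fixed_block(int len, int block_size, int occupancy_grid_cap)
{
  return {block_size, std::min(div_ceil(len, block_size), occupancy_grid_cap)};
}

// StridedLoop with block and grid both fixed at compile time: the dimensions
// are used as-is and len does not affect the launch shape.
std::pair<int, int> strided_fixed_both(int block_size, int grid_size)
{
  return {block_size, grid_size};
}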
header file containing constructs used to run kernel::forall - * traversals on GPU with CUDA. + * traversals on GPU with HIP. * ****************************************************************************** */ diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp index 10563bc20e..848ea42edf 100644 --- a/include/RAJA/policy/hip/kernel/For.hpp +++ b/include/RAJA/policy/hip/kernel/For.hpp @@ -45,7 +45,7 @@ template , sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -60,7 +60,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; using DimensionCalculator = RAJA::internal::KernelDimensionCalculator< - RAJA::policy::hip::hip_indexer, sync, IndexMapper>>; + RAJA::policy::hip::hip_indexer>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp index be7e256274..014b4db3ac 100644 --- a/include/RAJA/policy/hip/kernel/ForICount.hpp +++ b/include/RAJA/policy/hip/kernel/ForICount.hpp @@ -47,20 +47,20 @@ template , sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> : HipStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { using Base = HipStatementExecutor< Data, statement::For, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp index 67bea1299a..68156600b2 100644 --- a/include/RAJA/policy/hip/kernel/HipKernel.hpp +++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp @@ -87,7 +87,7 @@ namespace statement */ template struct HipKernelExt - : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { + : public internal::Statement<::RAJA::policy::hip::hip_exec, EnclosedStmts...> { }; @@ -263,7 +263,7 @@ struct HipLaunchHelper,Stmt inline static void recommended_blocks_threads(size_t shmem_size, int &recommended_blocks, int &recommended_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -273,8 +273,10 @@ struct HipLaunchHelper,Stmt // determine blocks at runtime // determine threads at runtime // - ::RAJA::hip::hip_occupancy_max_blocks_threads( - func, shmem_size, recommended_blocks, recommended_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_device; + recommended_threads = data.func_max_threads_per_block; } else { @@ -284,8 +286,9 @@ struct HipLaunchHelper,Stmt // recommended_threads = num_threads; - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, recommended_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } @@ -339,7 +342,7 @@ struct HipLaunchHelper,Stmt inline static void max_blocks(size_t shmem_size, int &max_blocks, int actual_threads) { - auto func = kernelGetter_t::get(); + auto func = reinterpret_cast(kernelGetter_t::get()); if (num_blocks <= 0) { @@ -352,16 +355,18 @@ struct HipLaunchHelper,Stmt // // determine blocks when actual_threads != num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks, actual_threads); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size, actual_threads); + max_blocks = 
data.func_max_blocks_per_sm * data.device_sm_per_device; } else { // // determine blocks when actual_threads == num_threads // - ::RAJA::hip::hip_occupancy_max_blocks( - func, shmem_size, max_blocks); + auto data = ::RAJA::hip::hip_occupancy_max_blocks( + func, shmem_size); + max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device; } diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp index 51a199226f..62dda7f20d 100644 --- a/include/RAJA/policy/hip/kernel/Tile.hpp +++ b/include/RAJA/policy/hip/kernel/Tile.hpp @@ -58,7 +58,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -69,7 +69,7 @@ struct HipStatementExecutor< using diff_t = segment_diff_type; - using DimensionCalculator = KernelDimensionCalculator, sync, IndexMapper>>; + using DimensionCalculator = KernelDimensionCalculator>; static inline RAJA_DEVICE void exec(Data &data, bool thread_active) diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp index 72e4114a23..07637fbd8f 100644 --- a/include/RAJA/policy/hip/kernel/TileTCount.hpp +++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp @@ -60,14 +60,14 @@ struct HipStatementExecutor< Data, statement::TileTCount, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> : public HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types> { @@ -75,7 +75,7 @@ struct HipStatementExecutor< Data, statement::Tile, - RAJA::policy::hip::hip_indexer, sync, IndexMapper>, + RAJA::policy::hip::hip_indexer, EnclosedStmts...>, Types>; diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp index 1c520d4af9..aa0610d736 100644 --- a/include/RAJA/policy/hip/kernel/internal.hpp +++ b/include/RAJA/policy/hip/kernel/internal.hpp @@ -217,7 +217,7 @@ struct KernelDimensionCalculator; // specialization for direct sequential policies template -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -234,7 +234,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -250,7 +250,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -271,7 +271,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -286,7 +286,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -307,7 +307,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -323,7 +323,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -343,7 +343,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { @@ -362,7 +362,7 @@ struct KernelDimensionCalculator -struct KernelDimensionCalculator, +struct KernelDimensionCalculator>> { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 8f605cb538..76f592d20b 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -348,7 +348,7 @@ struct LaunchExecute> { HIP generic loop implementations */ 
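Because the occupancy helpers now return per-SM block counts together with device properties, the launch helper above performs the per-device conversion itself. A minimal sketch of that conversion, with illustrative names:

// Standalone sketch: per-SM occupancy to a device-wide block count.
struct OccBlocksData {          // illustrative stand-in for the helper's cached data
  int func_max_blocks_per_sm;   // per-SM occupancy reported for the kernel
  int device_sm_per_device;     // multiprocessor count of the device
};

inline int max_blocks_per_device(OccBlocksData const& data)
{
  return data.func_max_blocks_per_sm * data.device_sm_per_device;
}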
template -struct LoopExecute, +struct LoopExecute, SEGMENT> { @@ -371,7 +371,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute, @@ -399,7 +399,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct LoopICountExecute, +struct LoopICountExecute, SEGMENT> { @@ -560,7 +560,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute, @@ -590,7 +590,7 @@ struct LoopICountExecute -struct LoopICountExecute, +struct LoopICountExecute -struct LoopExecute, +struct LoopExecute, SEGMENT> - : LoopExecute, + : LoopExecute, SEGMENT> {}; template -struct LoopExecute, +struct LoopExecute, @@ -777,7 +777,7 @@ struct LoopExecute -struct LoopExecute, +struct LoopExecute -struct TileExecute, +struct TileExecute, SEGMENT> { @@ -939,7 +939,7 @@ struct TileExecute -struct TileTCountExecute, +struct TileTCountExecute, SEGMENT> { diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 49cd489be4..3ff0dd553f 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -74,6 +74,86 @@ struct IndexGlobal; template struct IndexFlatten; +/*! + * Use the max occupancy of a kernel on the current device when launch + * parameters are not fully determined. + * Note that the maximum occupancy of the kernel may be less than the maximum + * occupancy of the device in terms of total threads. + */ +struct MaxOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use a fraction and an offset of the max occupancy of a kernel on the current + * device when launch parameters are not fully determined. + * The following formula is used, with care to avoid zero, to determine the + * maximum grid size: + * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm + */ +template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +struct FractionOffsetOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + using Fraction = typename t_Fraction::template rebind; + + IdxT device_sm_per_device = data.device_sm_per_device; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + + if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) { + func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm); + } + + if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) { + func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET); + } + + IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device; + + return func_max_blocks_per_device; + } +}; + +/*! + * Use an occupancy that is less than the max occupancy of the device when + * launch parameters are not fully determined. + * Use the MaxOccupancyConcretizer if the maximum occupancy of the kernel is + * below the maximum occupancy of the device. + * Otherwise use the given AvoidMaxOccupancyCalculator to determine the + * maximum grid size. 
+ */ +template < typename AvoidMaxOccupancyConcretizer > +struct AvoidDeviceMaxThreadOccupancyConcretizer +{ + template < typename IdxT, typename Data > + static IdxT get_max_grid_size(Data const& data) + { + IdxT device_max_threads_per_sm = data.device_max_threads_per_sm; + IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm; + IdxT func_threads_per_block = data.func_threads_per_block; + + IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm; + + if (func_max_threads_per_sm < device_max_threads_per_sm) { + return MaxOccupancyConcretizer::template get_max_grid_size(data); + } else { + return AvoidMaxOccupancyConcretizer::template get_max_grid_size(data); + } + } +}; + } // namespace hip namespace policy @@ -93,7 +173,8 @@ struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t< using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>; }; -template +template struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Policy::hip, RAJA::Pattern::forall, @@ -101,6 +182,7 @@ struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t< RAJA::Platform::hip> { using IterationMapping = _IterationMapping; using IterationGetter = _IterationGetter; + using LaunchConcretizer = _LaunchConcretizer; }; template @@ -816,6 +898,7 @@ struct IndexFlatten }; + // helper to get just the thread indexing part of IndexGlobal template < typename index_global > struct get_index_thread; @@ -876,44 +959,83 @@ using global_z = IndexGlobal; } // namespace hip +// contretizers used in forall, scan, and sort policies + +using HipDefaultAvoidMaxOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer, -1>; + +template < typename AvoidMaxOccupancyConcretizer > +using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; + +template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > +using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer; + +using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; + +using HipRecForReduceConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; + +using HipDefaultConcretizer = hip::MaxOccupancyConcretizer; + // policies usable with forall, scan, and sort + template using hip_exec_grid = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_grid_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec = policy::hip::hip_exec< - iteration_mapping::Direct<>, hip::global_x, Async>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, Async>; template using hip_exec_async = policy::hip::hip_exec< - iteration_mapping::Direct<>, hip::global_x, true>; + iteration_mapping::Direct, hip::global_x, + HipDefaultConcretizer, true>; template using hip_exec_occ_calc = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, Async>; + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, true>; template -using hip_exec_occ_calc_fraction = policy::hip::hip_exec< - iteration_mapping::StridedLoop, 
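Unlike the CUDA hunk earlier, the HIP reduce-recommended concretizer above routes through AvoidDeviceMaxThreadOccupancyConcretizer. A standalone sketch of the decision that concretizer documents: use full kernel occupancy when the kernel cannot saturate the SM's thread capacity anyway, otherwise fall back to the reduced-occupancy concretizer. Names are illustrative:

// Standalone sketch of the avoid-device-max-thread-occupancy decision.
inline int avoid_device_max_grid(int device_max_threads_per_sm,
                                 int device_sm_per_device,
                                 int func_max_blocks_per_sm,
                                 int func_threads_per_block,
                                 int fallback_blocks_per_sm)
{
  const int func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
  const int blocks_per_sm = (func_max_threads_per_sm < device_max_threads_per_sm)
                                ? func_max_blocks_per_sm   // kernel cannot saturate the SM
                                : fallback_blocks_per_sm;  // back off from full occupancy
  return blocks_per_sm * device_sm_per_device;
}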
hip::global_x, Async>; +using hip_exec_occ_fraction = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, Async>; template -using hip_exec_occ_calc_fraction_async = policy::hip::hip_exec< - iteration_mapping::StridedLoop, hip::global_x, true>; +using hip_exec_occ_fraction_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipFractionOffsetOccupancyConcretizer, true>; + +template +using hip_exec_occ_avoid_max = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; + +template +using hip_exec_occ_avoid_max_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; template -using hip_exec_rec_for_reduce = hip_exec_occ_calc_fraction, Async>; +using hip_exec_rec_for_reduce = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipRecForReduceConcretizer, Async>; template -using hip_exec_rec_for_reduce_async = hip_exec_occ_calc_fraction_async>; +using hip_exec_rec_for_reduce_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipRecForReduceConcretizer, true>; // policies usable with WorkGroup using policy::hip::hip_work; @@ -937,7 +1059,7 @@ using policy::hip::hip_block_reduce; using policy::hip::hip_warp_reduce; using hip_warp_direct = RAJA::policy::hip::hip_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, hip::thread_x>; using hip_warp_loop = RAJA::policy::hip::hip_indexer< @@ -961,7 +1083,7 @@ using policy::hip::hip_launch_t; // policies usable with kernel and launch template < typename ... indexers > using hip_indexer_direct = policy::hip::hip_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; @@ -979,7 +1101,7 @@ using hip_indexer_syncable_loop = policy::hip::hip_indexer< template < typename ... indexers > using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer< - iteration_mapping::Direct<>, + iteration_mapping::Direct, kernel_sync_requirement::none, indexers...>; diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp index 40e44c2e19..cdf0a9b82d 100644 --- a/include/RAJA/policy/hip/scan.hpp +++ b/include/RAJA/policy/hip/scan.hpp @@ -49,6 +49,7 @@ namespace scan */ template @@ -56,7 +57,7 @@ RAJA_INLINE resources::EventProxy inclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op) @@ -121,6 +122,7 @@ inclusive_inplace( */ template exclusive_inplace( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, Function binary_op, @@ -198,6 +200,7 @@ exclusive_inplace( */ template inclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, @@ -271,6 +274,7 @@ inclusive( */ template exclusive( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, InputIter begin, InputIter end, OutputIter out, diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp index a6918968c8..eb16246623 100644 --- a/include/RAJA/policy/hip/sort.hpp +++ b/include/RAJA/policy/hip/sort.hpp @@ -73,7 +73,9 @@ namespace detail /*! 
\brief static assert unimplemented stable sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -83,7 +85,7 @@ concepts::enable_if_t, camp::is_same>>>>>> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -102,13 +104,15 @@ stable( /*! \brief stable sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::less>) @@ -190,13 +194,15 @@ stable( /*! \brief stable sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> stable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter begin, Iter end, operators::greater>) @@ -279,7 +285,9 @@ stable( /*! \brief static assert unimplemented sort */ -template +template concepts::enable_if_t, concepts::negate>, @@ -289,7 +297,7 @@ concepts::enable_if_t, camp::is_same>>>>>> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, Iter, Iter, Compare) @@ -308,13 +316,15 @@ unstable( /*! \brief sort given range in ascending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::less> comp) @@ -325,13 +335,15 @@ unstable( /*! \brief sort given range in descending order */ -template +template concepts::enable_if_t, type_traits::is_arithmetic>, std::is_pointer> unstable( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, Iter begin, Iter end, operators::greater> comp) @@ -343,7 +355,8 @@ unstable( /*! \brief static assert unimplemented stable sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -379,7 +392,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -387,7 +401,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -483,7 +497,8 @@ stable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -491,7 +506,7 @@ concepts::enable_if_t, std::is_pointer> stable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -588,7 +603,8 @@ stable_pairs( /*! \brief static assert unimplemented sort pairs */ -template concepts::enable_if_t, concepts::negate, camp::is_same>>>>>> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec, + ::RAJA::policy::hip::hip_exec, KeyIter, KeyIter, ValIter, @@ -624,7 +640,8 @@ unstable_pairs( /*! 
\brief stable sort given range of pairs in ascending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -632,7 +649,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, @@ -644,7 +661,8 @@ unstable_pairs( /*! \brief stable sort given range of pairs in descending order of keys */ -template concepts::enable_if_t, type_traits::is_arithmetic>, @@ -652,7 +670,7 @@ concepts::enable_if_t, std::is_pointer> unstable_pairs( resources::Hip hip_res, - ::RAJA::policy::hip::hip_exec p, + ::RAJA::policy::hip::hip_exec p, KeyIter keys_begin, KeyIter keys_end, ValIter vals_begin, diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp index a54ce434a2..28a476d951 100644 --- a/include/RAJA/util/resource.hpp +++ b/include/RAJA/util/resource.hpp @@ -65,8 +65,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ + template + struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit>{ using type = camp::resources::Cuda; }; @@ -75,8 +76,9 @@ namespace RAJA using type = camp::resources::Cuda; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Cuda; }; #endif @@ -87,8 +89,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource<::RAJA::policy::hip::hip_exec>{ + template + struct get_resource<::RAJA::policy::hip::hip_exec>{ using type = camp::resources::Hip; }; @@ -97,8 +100,9 @@ namespace RAJA using type = camp::resources::Hip; }; - template - struct get_resource>>{ + template + struct get_resource>>{ using type = camp::resources::Hip; }; #endif diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 95b139bce5..011082953d 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -100,7 +100,6 @@ struct SizedLoopSpecifyingBase : SizedLoopBase /// // 3 -> {3} /// // 4 -> {} /// -template < typename ... Modifiers > struct Direct : DirectBase {}; /// @@ -128,7 +127,7 @@ struct Direct : DirectBase {}; /// // 1 -> {3, 4, 5} /// // 2 -> {6, 7} /// -template < size_t max_iterations, typename ... Modifiers > +template < size_t max_iterations > struct Contiguousloop : ContiguousLoopBase, std::conditional_t<(max_iterations != named_usage::unspecified), SizedLoopSpecifyingBase, UnsizedLoopBase> {}; @@ -158,7 +157,7 @@ struct Contiguousloop : ContiguousLoopBase, /// // 1 -> {1, 4, 7} /// // 2 -> {2, 5} /// -template < size_t max_iterations, typename ... 
Modifiers > +template < size_t max_iterations > struct StridedLoop : StridedLoopBase, std::conditional_t<(max_iterations != named_usage::unspecified), SizedLoopSpecifyingBase, UnsizedLoopBase> {}; @@ -201,6 +200,9 @@ struct Fraction using inverse = Fraction; + template < typename new_int_t > + using rebind = Fraction; + static constexpr int_t multiply(int_t val) noexcept { return (val / denominator) * numerator + diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index 458e6d06d0..f09ff71182 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -109,7 +109,8 @@ using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_occ_calc<256>, RAJA::cuda_exec_grid<256, 64>, RAJA::cuda_exec_explicit<256,2>, - RAJA::cuda_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::cuda_exec_occ_avoid_max<256> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -121,7 +122,8 @@ using CudaForallAtomicExecPols = CudaForallExecPols; using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, RAJA::hip_exec_grid<256, 64>, - RAJA::hip_exec_occ_calc_fraction<256, RAJA::Fraction> >; + RAJA::hip_exec_occ_fraction<256, RAJA::Fraction>, + RAJA::hip_exec_occ_avoid_max<256> >; using HipForallReduceExecPols = HipForallExecPols; From f0bdae976c32aebc6ac7fae3fd54e2032cb02a5f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 14:09:34 -0700 Subject: [PATCH 025/108] Simplify cuda_exec_occ_avoid_max --- include/RAJA/policy/cuda/policy.hpp | 25 +++++++++++-------------- include/RAJA/policy/hip/policy.hpp | 17 +++++++---------- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index f8ec6773c8..20c0a7a4de 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -966,19 +966,16 @@ using global_z = IndexGlobal; // contretizers used in forall, scan, and sort policies -using CudaDefaultAvoidMaxOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer, -1>; - -template < typename AvoidMaxOccupancyConcretizer > -using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer; +using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer; using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; -using CudaRecForReduceConcretizer = cuda::MaxOccupancyConcretizer; +using CudaRecForReduceConcretizer = CudaMaxOccupancyConcretizer; -using CudaDefaultConcretizer = cuda::MaxOccupancyConcretizer; +using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer; // policies usable with forall, scan, and sort @@ -1062,25 +1059,25 @@ using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; -template +template using cuda_exec_occ_avoid_max_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; -template +template using 
cuda_exec_occ_avoid_max_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; -template +template using cuda_exec_occ_avoid_max = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; -template +template using cuda_exec_occ_avoid_max_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 3ff0dd553f..aa356c132f 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -961,19 +961,16 @@ using global_z = IndexGlobal; // contretizers used in forall, scan, and sort policies -using HipDefaultAvoidMaxOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer, -1>; - -template < typename AvoidMaxOccupancyConcretizer > -using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; +using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer, -1>>; template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET > using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer; using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; -using HipRecForReduceConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer; +using HipRecForReduceConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; -using HipDefaultConcretizer = hip::MaxOccupancyConcretizer; +using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; // policies usable with forall, scan, and sort @@ -1017,15 +1014,15 @@ using hip_exec_occ_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipFractionOffsetOccupancyConcretizer, true>; -template +template using hip_exec_occ_avoid_max = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; + HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; -template +template using hip_exec_occ_avoid_max_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; + HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; template using hip_exec_rec_for_reduce = policy::hip::hip_exec< From c44e06bed5439194d060d104f7e4e57c22422dc1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 14:50:41 -0700 Subject: [PATCH 026/108] Use 1/2 occupancy as HipRecForReduceConcretizer --- include/RAJA/policy/hip/policy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index aa356c132f..304fa55c32 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -968,7 +968,7 @@ using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcre using HipMaxOccupancyConcretizer = 
hip::MaxOccupancyConcretizer; -using HipRecForReduceConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; +using HipRecForReduceConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; From 69e8bd70fd776b172e91fc0041ea9d1ab75d5124 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 15:06:36 -0700 Subject: [PATCH 027/108] CHanges to occ policies occ_calc now uses the default (may not be max) occ_max added to use max occ_custom added for using whatever concretizer you'd like occ_avoid_max removed --- include/RAJA/policy/cuda/policy.hpp | 50 ++++++++++++++++++++--------- include/RAJA/policy/hip/policy.hpp | 24 ++++++++++---- 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 20c0a7a4de..e7a72b2be7 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1022,20 +1022,40 @@ using cuda_exec_async = policy::cuda::cuda_exec_explicit< template using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; + CudaDefaultConcretizer, BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; + CudaDefaultConcretizer, BLOCKS_PER_SM, true>; template using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>; + +template +using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit< + iteration_mapping::StridedLoop, cuda::global_x, + CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + +template +using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; @@ -1059,25 +1079,25 @@ using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaFractionOffsetOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; -template -using cuda_exec_occ_avoid_max_explicit = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, Async>; + Concretizer, BLOCKS_PER_SM, Async>; -template -using cuda_exec_occ_avoid_max_explicit_async = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - 
CudaAvoidDeviceMaxThreadOccupancyConcretizer, BLOCKS_PER_SM, true>; + Concretizer, BLOCKS_PER_SM, true>; -template -using cuda_exec_occ_avoid_max = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; -template -using cuda_exec_occ_avoid_max_async = policy::cuda::cuda_exec_explicit< +template +using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaAvoidDeviceMaxThreadOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 304fa55c32..65c87ff203 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -997,10 +997,20 @@ using hip_exec_async = policy::hip::hip_exec< template using hip_exec_occ_calc = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipMaxOccupancyConcretizer, Async>; + HipDefaultConcretizer, Async>; template using hip_exec_occ_calc_async = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipDefaultConcretizer, true>; + +template +using hip_exec_occ_max = policy::hip::hip_exec< + iteration_mapping::StridedLoop, hip::global_x, + HipMaxOccupancyConcretizer, Async>; + +template +using hip_exec_occ_max_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipMaxOccupancyConcretizer, true>; @@ -1014,15 +1024,15 @@ using hip_exec_occ_fraction_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipFractionOffsetOccupancyConcretizer, true>; -template -using hip_exec_occ_avoid_max = policy::hip::hip_exec< +template +using hip_exec_occ_custom = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, Async>; + Concretizer, Async>; -template -using hip_exec_occ_avoid_max_async = policy::hip::hip_exec< +template +using hip_exec_occ_custom_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipAvoidDeviceMaxThreadOccupancyConcretizer, true>; + Concretizer, true>; template using hip_exec_rec_for_reduce = policy::hip::hip_exec< From b58c675461aa7e0cd3459d4ed91cc3b3fec2649d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 15:42:04 -0700 Subject: [PATCH 028/108] Add simple cook book with reduction example --- docs/sphinx/user_guide/cook_book.rst | 24 ++++++ .../sphinx/user_guide/cook_book/reduction.rst | 78 +++++++++++++++++++ docs/sphinx/user_guide/feature/reduction.rst | 4 + docs/sphinx/user_guide/index.rst | 1 + 4 files changed, 107 insertions(+) create mode 100644 docs/sphinx/user_guide/cook_book.rst create mode 100644 docs/sphinx/user_guide/cook_book/reduction.rst diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst new file mode 100644 index 0000000000..44c89c3d51 --- /dev/null +++ b/docs/sphinx/user_guide/cook_book.rst @@ -0,0 +1,24 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. 
_cook-book-label: + +************************ +RAJA Cook Book +************************ + +The following sections show common use case patterns and the recommended +RAJA features and policies to use with them. They are intended +for users to copy and paste into their code and provide guidance on +which policy to use with each backend to get good performance. + +.. toctree:: + :maxdepth: 2 + + cook_book/reduction + diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst new file mode 100644 index 0000000000..309561bc38 --- /dev/null +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -0,0 +1,78 @@ +.. ## +.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +.. ## and other RAJA project contributors. See the RAJA/LICENSE file +.. ## for details. +.. ## +.. ## SPDX-License-Identifier: (BSD-3-Clause) +.. ## + +.. _cook-book-reductions-label: + +======================= +Cooking with Reductions +======================= + +Please see the following section for more info on RAJA reductions: + + * :ref:`feat-reductions-label`. + + +---------------------------- +Reductions with RAJA::forall +---------------------------- + +Here is the setup for a simple reduction example:: + + const int N = 1000; + + int vec[N]; + + for (int i = 0; i < N; ++i) { + + vec[i] = 1; + + } + +Here a simple sum reduction is performed in a for loop:: + + int vsum = 0; + + // Run a kernel using the reduction objects + for (int i = 0; i < N; ++i) { + + vsum += vec[i]; + + } + +The results of these operations will yield the following values: + + * vsum == 1000 + +Here a simple sum reduction is performed using RAJA:: + + using reduce_policy = RAJA::seq_reduce; + // using reduce_policy = RAJA::omp_reduce; + // using reduce_policy = RAJA::omp_target_reduce; + // using reduce_policy = RAJA::cuda_reduce; + // using reduce_policy = RAJA::hip_reduce; + // using reduce_policy = RAJA::sycl_reduce; + + using exec_policy = RAJA::seq_exec; + // using exec_policy = RAJA::omp_parallel_for_exec; + // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; + // using exec_policy = RAJA::cuda_exec_rec_for_reduce<256>; + // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>; + // using exec_policy = RAJA::sycl_exec<256>; + + RAJA::ReduceSum vsum(0); + + RAJA::forall( RAJA::RangeSegment(0, N), + [=](RAJA::Index_type i) { + + vsum += vec[i]; + + }); + +The results of these operations will yield the following values: + + * vsum.get() == 1000 diff --git a/docs/sphinx/user_guide/feature/reduction.rst b/docs/sphinx/user_guide/feature/reduction.rst index 8643e4a225..5f2f09afad 100644 --- a/docs/sphinx/user_guide/feature/reduction.rst +++ b/docs/sphinx/user_guide/feature/reduction.rst @@ -39,6 +39,10 @@ RAJA reductions: * :ref:`tut-reduction-label`. +Please see the following cook book sections for guidance on policy usage: + + * :ref:`cook-book-reductions-label`. + ---------------- Reduction Types diff --git a/docs/sphinx/user_guide/index.rst b/docs/sphinx/user_guide/index.rst index f2fb6ca46d..f73f4d9449 100644 --- a/docs/sphinx/user_guide/index.rst +++ b/docs/sphinx/user_guide/index.rst @@ -32,5 +32,6 @@ to use RAJA in an application can be found in :ref:`app-considerations-label`. 
using_raja config_options features + cook_book app_considerations tutorial From fed1838d851f3fe39b9fe3e7d33a499e0c1e184a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 5 Apr 2024 16:30:13 -0700 Subject: [PATCH 029/108] Add user guide documentation of the new policies --- docs/sphinx/user_guide/feature/policies.rst | 55 ++++++++++++++++++--- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index d3f982951a..ad1196237d 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -257,7 +257,7 @@ policies have the prefix ``hip_``. Note that the thread-block size and grid size must be provided, there is no default. - cuda/hip_exec_occ_calc forall Execute loop iterations + cuda/hip_exec_occ_max forall Execute loop iterations mapped to global threads via grid striding with multiple iterations per global thread @@ -265,12 +265,20 @@ policies have the prefix ``hip_``. with given thread-block size and grid size bounded by the maximum occupancy of - the kernel. Note that the - thread-block size must - be provided, there is no - default. Note this can improve - reducer performance in kernels - with large iteration counts. + the kernel. + cuda/hip_exec_occ_calc forall Similar to the occ_max + policy but may use less + than the maximum occupancy + of the kernel for performance + reasons. + cuda/hip_exec_occ_fraction> of the maximum occupancy + of the kernel. + cuda/hip_exec_occ_custom policy but the grid size + is determined by the + concretizer. cuda/hip_exec_rec_for_reduce forall The cuda/hip exec policy that is recommended for use with reducers. @@ -414,6 +422,39 @@ policies have the prefix ``hip_``. thread warp. ========================================= ============= ======================================= +When a cuda/hip policy leaves parameters like the block size and/or grid size +unspecified a concretizer object is used to decide those parameters. The +following concretizers are available to use in the cuda/hip_exec_occ_custom +policies: + +=================================================== ========================================= +Execution Policy Brief description +=================================================== ========================================= + +Cuda/HipDefaultConcretizer The default concretizer, expected to + provide good performance in general. + Note that it may not use max occupancy. + +Cuda/HipRecForReduceConcretizer Expected to provide good performance + in loops with reducers. + Note that it may not use max occupancy. + +Cuda/HipMaxOccupancyConcretizer Uses max occupancy. + +Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupancy of the + device in terms of threads. + Note that it may use the max occupancy + of the function if that is below the max + occupancy of the device. + +Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an + Fraction, occupancy based on the max occupancy + BLOCKS_PER_SM_OFFSET> Using the following formula. + (Fraction * kernel_max_blocks_per_sm + + BLOCKS_PER_SM_OFFSET) * sm_per_device + +=================================================== ========================================= + Several notable constraints apply to RAJA CUDA/HIP *direct* policies. .. 
note:: * Repeating direct policies with the same dimension in perfectly From 9e698bbe553d5560762db439b1573225e6d28115 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 6 Apr 2024 14:22:17 -0700 Subject: [PATCH 030/108] Change policy in tests --- test/include/RAJA_test-forall-execpol.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/include/RAJA_test-forall-execpol.hpp b/test/include/RAJA_test-forall-execpol.hpp index f09ff71182..40adaccc8c 100644 --- a/test/include/RAJA_test-forall-execpol.hpp +++ b/test/include/RAJA_test-forall-execpol.hpp @@ -110,7 +110,7 @@ using CudaForallExecPols = camp::list< RAJA::cuda_exec<128>, RAJA::cuda_exec_grid<256, 64>, RAJA::cuda_exec_explicit<256,2>, RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction>, - RAJA::cuda_exec_occ_avoid_max<256> >; + RAJA::cuda_exec_occ_custom<256, RAJA::CudaAvoidDeviceMaxThreadOccupancyConcretizer> >; using CudaForallReduceExecPols = CudaForallExecPols; @@ -123,7 +123,7 @@ using HipForallExecPols = camp::list< RAJA::hip_exec<128>, RAJA::hip_exec_occ_calc<256>, RAJA::hip_exec_grid<256, 64>, RAJA::hip_exec_occ_fraction<256, RAJA::Fraction>, - RAJA::hip_exec_occ_avoid_max<256> >; + RAJA::hip_exec_occ_custom<256, RAJA::HipAvoidDeviceMaxThreadOccupancyConcretizer> >; using HipForallReduceExecPols = HipForallExecPols; From aae648ccce6c3edd53700d83e51a0b85cb3674ed Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:08:29 +0200 Subject: [PATCH 031/108] From RSC: add CARE --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index b09f869f9d..078498cdfc 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit b09f869f9d9aff6ecf6544a0161d96c2b18d13b8 +Subproject commit 078498cdfcc5b6024ff44964d4032a5ad5793a2f From 08f1ee0fd08ccbd15d7036b7b7f8b61827afbb36 Mon Sep 17 00:00:00 2001 From: "Adrien M. BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Thu, 11 Apr 2024 19:01:37 +0200 Subject: [PATCH 032/108] From RSC: Fix merge with CARE package --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 078498cdfc..5dfa405e08 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 078498cdfcc5b6024ff44964d4032a5ad5793a2f +Subproject commit 5dfa405e0883e5177ee96d4995cd57be4b254d8f From b865799f7756878d18a002c1cc3b7816efaeeb67 Mon Sep 17 00:00:00 2001 From: "Adrien M. 
BERNEDE" <51493078+adrienbernede@users.noreply.github.com> Date: Mon, 15 Apr 2024 11:55:49 +0200 Subject: [PATCH 033/108] Point at RADIUSS Spack Configs @ main --- scripts/radiuss-spack-configs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/radiuss-spack-configs b/scripts/radiuss-spack-configs index 5dfa405e08..a8d22367e0 160000 --- a/scripts/radiuss-spack-configs +++ b/scripts/radiuss-spack-configs @@ -1 +1 @@ -Subproject commit 5dfa405e0883e5177ee96d4995cd57be4b254d8f +Subproject commit a8d22367e03d4c9c180a11886414430bdf6428a8 From db437d23011957565a22d58deaa64f4a1717738c Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Mon, 15 Apr 2024 17:47:40 +0200 Subject: [PATCH 034/108] Update .gitlab/custom-jobs-and-variables.yml Co-authored-by: Rich Hornung --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index f652bb2caf..da32e89a77 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -49,7 +49,7 @@ variables: # Arguments for job level allocation TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona - PROJECT_TIOGA_VARIANTS: "~shared ~openmp +vectorization +tests" + PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona PROJECT_TIOGA_DEPS: "^blt@develop " From e7fd18cfea5467f27ada643cdb7cced1e0df3937 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 09:42:35 -0700 Subject: [PATCH 035/108] bug fixes for the bump style allocator --- include/RAJA/pattern/launch/launch_core.hpp | 8 ++++++-- .../shared_mem/tests/test-launch-DynamicMem.hpp | 12 ++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 6f56f4ed65..4a2f6c222a 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -174,10 +174,14 @@ class LaunchContext template RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes) { - T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset]; + + //Calculate offset in bytes with a char pointer + char* mem_ptr = (char*) shared_mem_ptr + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); - return mem_ptr; + + //convert to desired type + return (T *) mem_ptr; } /* diff --git a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp index cdb8940256..8da7b81eb7 100644 --- a/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp +++ b/test/functional/launch/shared_mem/tests/test-launch-DynamicMem.hpp @@ -36,12 +36,16 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) for(s_type b=0; b (RAJA::LaunchParams(RAJA::Teams(RAJA::stripIndexType(block_range)), RAJA::Threads(RAJA::stripIndexType(thread_range)), shared_mem_size), @@ -52,7 +56,11 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) INDEX_TYPE * tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); RAJA::View> Tile(tile_ptr, RAJA::stripIndexType(thread_range)); + int * int_tile_ptr = ctx.getSharedMemory(RAJA::stripIndexType(thread_range)); + RAJA::View> Int_Tile(int_tile_ptr, RAJA::stripIndexType(thread_range)); + RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { + 
Int_Tile(RAJA::stripIndexType(tid)) = RAJA::stripIndexType(tid); Tile(RAJA::stripIndexType(thread_range)-RAJA::stripIndexType(tid)-1) = thread_range-tid-1 + thread_range*bid; }); @@ -60,7 +68,7 @@ void LaunchDynamicMemTestImpl(INDEX_TYPE block_range, INDEX_TYPE thread_range) RAJA::loop(ctx, inner_range, [&](INDEX_TYPE tid) { INDEX_TYPE idx = tid + thread_range * bid; - working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)); + working_array[RAJA::stripIndexType(idx)] = Tile(RAJA::stripIndexType(tid)) + Int_Tile(RAJA::stripIndexType(tid)); }); ctx.releaseSharedMemory(); From dd7b78f267c2495551075fb6bb56fd77d1085c4d Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 12:51:50 -0700 Subject: [PATCH 036/108] Update include/RAJA/pattern/launch/launch_core.hpp Co-authored-by: Rich Hornung --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 4a2f6c222a..f03ad1e075 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -176,7 +176,7 @@ class LaunchContext { //Calculate offset in bytes with a char pointer - char* mem_ptr = (char*) shared_mem_ptr + shared_mem_offset; + char* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); From bb0eaa5c8be3dfefaf8ca849ff61a6c548c76639 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 12:51:54 -0700 Subject: [PATCH 037/108] Update include/RAJA/pattern/launch/launch_core.hpp Co-authored-by: Rich Hornung --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index f03ad1e075..727f9b064c 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -181,7 +181,7 @@ class LaunchContext shared_mem_offset += bytes*sizeof(T); //convert to desired type - return (T *) mem_ptr; + return static_cast(mem_ptr); } /* From 4b846d6a0fd2e7914c5fd650237b8eb119fe95f3 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 15 Apr 2024 13:21:16 -0700 Subject: [PATCH 038/108] char -> void --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 727f9b064c..213c435236 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -176,7 +176,7 @@ class LaunchContext { //Calculate offset in bytes with a char pointer - char* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; + void* mem_ptr = static_cast(shared_mem_ptr) + shared_mem_offset; shared_mem_offset += bytes*sizeof(T); From 53e6feb6bbc1a0a76384d147294da66f410cf3b7 Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:35:05 +0200 Subject: [PATCH 039/108] Use new pci queue on tioga --- .gitlab/custom-jobs-and-variables.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index da32e89a77..b869af6f50 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -45,9 +45,9 @@ variables: # Tioga # Arguments for top level allocation - 
TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1 -o per-resource.count=2" + TIOGA_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" + TIOGA_JOB_ALLOC: "--queue=pci --nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona From 3d79e968348cf63a922c4ead633902611560f8ff Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Tue, 16 Apr 2024 13:03:13 +0200 Subject: [PATCH 040/108] pci queue not recognized in sub-job --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index b869af6f50..e6da7cecbf 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -47,7 +47,7 @@ variables: # Arguments for top level allocation TIOGA_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2" # Arguments for job level allocation - TIOGA_JOB_ALLOC: "--queue=pci --nodes=1 --begin-time=+5s" + TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s" # Project specific variants for corona PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests" # Project specific deps for corona From 3385e0f070963f4f741b253d6b8c89207c0a9a44 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:13:06 -0700 Subject: [PATCH 041/108] Add more documentation on the exec and reduce policies to the reductioun cookbook --- .../sphinx/user_guide/cook_book/reduction.rst | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index 309561bc38..5c17e3a626 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -48,14 +48,13 @@ The results of these operations will yield the following values: * vsum == 1000 -Here a simple sum reduction is performed using RAJA:: +RAJA uses policy types to specify how things are implemented. - using reduce_policy = RAJA::seq_reduce; - // using reduce_policy = RAJA::omp_reduce; - // using reduce_policy = RAJA::omp_target_reduce; - // using reduce_policy = RAJA::cuda_reduce; - // using reduce_policy = RAJA::hip_reduce; - // using reduce_policy = RAJA::sycl_reduce; +The forall execution policy specifies how the loop is run in the forall. +For example ``RAJA::seq_exec`` runs a c-style for loop. The +``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a cuda kernel with +256 threads per block and other cuda kernel launch parameters, like the +number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; @@ -64,6 +63,22 @@ Here a simple sum reduction is performed using RAJA:: // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>; // using exec_policy = RAJA::sycl_exec<256>; +The reduction policy specifies how the reduction is done and must match the +execution policy. For example ``RAJA::seq_reduce`` does a sequential reduction +and can only be used with sequential execution policies. 
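A minimal sketch of one way to keep the execution and reduction policies matched is to select the pair once behind a build-time switch; the alias names ``exec_policy``/``reduce_policy`` and the 256 thread-block size below are illustrative assumptions, not prescribed by RAJA::

    // Hypothetical policy-selection sketch; the alias names and block size
    // are assumptions chosen for illustration.
    #if defined(RAJA_ENABLE_CUDA)
      using exec_policy   = RAJA::cuda_exec_rec_for_reduce<256>;
      using reduce_policy = RAJA::cuda_reduce_atomic;
    #elif defined(RAJA_ENABLE_HIP)
      using exec_policy   = RAJA::hip_exec_rec_for_reduce<256>;
      using reduce_policy = RAJA::hip_reduce_atomic;
    #else
      using exec_policy   = RAJA::seq_exec;
      using reduce_policy = RAJA::seq_reduce;
    #endif

With such a pair in hand, application loops can refer to a single alias and remain portable across backends.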
The +``RAJA::cuda_reduce_atomic`` policy uses atomics, if possible with the given +data type, and can only be used with cuda execution policies.:: + + using reduce_policy = RAJA::seq_reduce; + // using reduce_policy = RAJA::omp_reduce; + // using reduce_policy = RAJA::omp_target_reduce; + // using reduce_policy = RAJA::cuda_reduce_atomic; + // using reduce_policy = RAJA::hip_reduce_atomic; + // using reduce_policy = RAJA::sycl_reduce; + + +Here a simple sum reduction is performed using RAJA:: + RAJA::ReduceSum vsum(0); RAJA::forall( RAJA::RangeSegment(0, N), From c8ba75e068180f2b287363bd1250919834491253 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:17:00 -0700 Subject: [PATCH 042/108] Add more explanation of rec_for_reduce policy --- docs/sphinx/user_guide/feature/policies.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index ad1196237d..5a0670a657 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -269,6 +269,7 @@ policies have the prefix ``hip_``. cuda/hip_exec_occ_calc forall Similar to the occ_max policy but may use less than the maximum occupancy + determined by the occupancy calculator of the kernel for performance reasons. cuda/hip_exec_occ_fraction forall The cuda/hip exec policy that is recommended for - use with reducers. + use with reducers. In general using + the occupancy calculator policies + are better but exactly how much + occupancy to use differs by platform + so this policy provides a simple way + to get what works best for that platform + without having to know the details. cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed From b966a222357ff17820af1639043e627f9670a610 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:19:13 -0700 Subject: [PATCH 043/108] Improve cuda/hip concretizer docs --- docs/sphinx/user_guide/feature/policies.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 5a0670a657..3554873a08 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -431,7 +431,7 @@ policies have the prefix ``hip_``. When a cuda/hip policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. The -following concretizers are available to use in the cuda/hip_exec_occ_custom +following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` policies: =================================================== ========================================= @@ -451,12 +451,12 @@ Cuda/HipMaxOccupancyConcretizer Uses max occupancy. Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupancy of the device in terms of threads. Note that it may use the max occupancy - of the function if that is below the max + of the kernel if that is below the max occupancy of the device. Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an Fraction, occupancy based on the max occupancy - BLOCKS_PER_SM_OFFSET> Using the following formula. 
+ BLOCKS_PER_SM_OFFSET> Using the following formula: (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * sm_per_device From bad690723e2e5468d261b4778c084cc3541aa9f9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 16 Apr 2024 16:45:43 -0700 Subject: [PATCH 044/108] Improve docs in MemUtils_CUDA/HIP --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 73 ++++++++++++++-------- include/RAJA/policy/hip/MemUtils_HIP.hpp | 51 +++++++++------ 2 files changed, 78 insertions(+), 46 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 7eee19dacf..54a8a7e008 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -279,6 +279,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE cudaDeviceProp get_device_prop() { @@ -289,6 +290,7 @@ cudaDeviceProp get_device_prop() return prop; } +//! Get a cached copy of the device properties RAJA_INLINE cudaDeviceProp& device_prop() { @@ -297,12 +299,14 @@ cudaDeviceProp& device_prop() } +//! Struct with the maximum theoretical occupancy of the device struct CudaFixedMaxBlocksData { int device_sm_per_device; int device_max_threads_per_sm; }; +//! Get the maximum theoretical occupancy of the device RAJA_INLINE CudaFixedMaxBlocksData cuda_max_blocks() { @@ -313,6 +317,7 @@ CudaFixedMaxBlocksData cuda_max_blocks() return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct CudaOccMaxBlocksThreadsData { size_t func_dynamic_shmem_per_block; @@ -320,15 +325,18 @@ struct CudaOccMaxBlocksThreadsData int func_max_threads_per_block; }; +//! Get the maximum occupancy of a kernel with unknown threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, size_t func_dynamic_shmem_per_block) { + static constexpr int uninitialized_int = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksThreadsData data { - std::numeric_limits::max(), - -1, - -1 }; + uninitialized_size_t, + uninitialized_int, + uninitialized_int }; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { @@ -342,6 +350,7 @@ CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, return data; } +//! Struct with the maximum occupancy of a kernel in specific terms struct CudaOccMaxBlocksData { size_t func_dynamic_shmem_per_block; @@ -351,17 +360,20 @@ struct CudaOccMaxBlocksData int func_max_blocks_per_sm; }; +//! Get the maximum occupancy of a kernel with compile time threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block) { + static constexpr int uninitialized_int = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksData data { - std::numeric_limits::max(), + uninitialized_size_t, func_threads_per_block, cuda::device_prop().multiProcessorCount, cuda::device_prop().maxThreadsPerMultiProcessor, - -1 }; + uninitialized_int }; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { @@ -375,17 +387,20 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, return data; } +//! 
Get the maximum occupancy of a kernel with runtime threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block, int func_threads_per_block) { + static constexpr int uninitialized_int = -1; + static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); static thread_local CudaOccMaxBlocksData data { - std::numeric_limits::max(), - -1, + uninitialized_size_t, + uninitialized_int, cuda::device_prop().multiProcessorCount, cuda::device_prop().maxThreadsPerMultiProcessor, - -1 }; + uninitialized_int }; if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || data.func_threads_per_block != func_threads_per_block ) { @@ -401,17 +416,31 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, return data; } + /*! ****************************************************************************** * - * \brief Cuda Concretizer Implementation. + * \brief Concretizer Implementation that chooses block size and/or grid + * size when they has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. - * \tparam Concretizer Class the determines the max number of blocks to use when - * fitting for the device. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. * \tparam UniqueMarker A type that is unique to each global function, used to * help cache the occupancy data for that global function. * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. + * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * ****************************************************************************** */ template < typename IdxT, typename Concretizer, typename UniqueMarker> @@ -423,7 +452,6 @@ struct ConcretizerImpl , m_len(len) { } - // Get the maximum block size IdxT get_max_block_size() const { auto data = cuda_occupancy_max_blocks_threads( @@ -432,8 +460,7 @@ struct ConcretizerImpl return func_max_threads_per_block; } - // Get a block size that combined with the given grid size is large enough - // to do len work, or 0 if not possible + //! Get a block size when grid size is specified IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -445,16 +472,14 @@ struct ConcretizerImpl } } - // Get a grid size that combined with the given block size is large enough - // to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const { IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); return func_blocks_per_device; } - // Get a block size and grid size that combined is large enough - // to do len work + //! 
Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_len() const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -463,9 +488,7 @@ struct ConcretizerImpl func_blocks_per_device); } - // Get a block size that combined with the given grid size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a block size when grid size is specified IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -473,9 +496,7 @@ struct ConcretizerImpl return std::min(func_threads_per_block, func_max_threads_per_block); } - // Get a grid size that combined with the given block size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { auto data = cuda_occupancy_max_blocks( @@ -485,9 +506,7 @@ struct ConcretizerImpl return std::min(func_blocks_per_device, func_max_blocks_per_device); } - // Get a block size and grid size that combined is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_device() const { IdxT func_max_threads_per_block = this->get_max_block_size(); diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 9b8442637b..bfb07bc569 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -281,6 +281,7 @@ RAJA_INLINE typename std::remove_reference::type make_launch_body( return return_type(std::forward(loop_body)); } +//! Get the properties of the current device RAJA_INLINE hipDeviceProp_t get_device_prop() { @@ -291,6 +292,7 @@ hipDeviceProp_t get_device_prop() return prop; } +//! Get a cached copy of the device properties RAJA_INLINE hipDeviceProp_t& device_prop() { @@ -299,12 +301,14 @@ hipDeviceProp_t& device_prop() } +//! Struct with the maximum theoretical occupancy of the device struct HipFixedMaxBlocksData { int device_sm_per_device; int device_max_threads_per_sm; }; +//! Get the maximum theoretical occupancy of the device RAJA_INLINE HipFixedMaxBlocksData hip_max_blocks() { @@ -315,6 +319,7 @@ HipFixedMaxBlocksData hip_max_blocks() return data; } +//! Struct with the maximum occupancy of a kernel in simple terms struct HipOccMaxBlocksThreadsData { size_t func_dynamic_shmem_per_block; @@ -322,6 +327,7 @@ struct HipOccMaxBlocksThreadsData int func_max_threads_per_block; }; +//! Get the maximum occupancy of a kernel with unknown threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, @@ -351,6 +357,7 @@ HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, return data; } +//! Struct with the maximum occupancy of a kernel in specific terms struct HipOccMaxBlocksData { size_t func_dynamic_shmem_per_block; @@ -360,6 +367,7 @@ struct HipOccMaxBlocksData int func_max_blocks_per_sm; }; +//! 
Get the maximum occupancy of a kernel with compile time threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block > RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, @@ -391,6 +399,7 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, return data; } +//! Get the maximum occupancy of a kernel with runtime threads per block template < typename RAJA_UNUSED_ARG(UniqueMarker) > RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, @@ -423,17 +432,31 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, return data; } + /*! ****************************************************************************** * - * \brief Hip Concretizer Implementation. + * \brief Concretizer Implementation that chooses block size and/or grid + * size when they has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. - * \tparam Concretizer Class the determines the max number of blocks to use when - * fitting for the device. + * \tparam Concretizer Class that determines the max number of blocks to use + * when fitting for the device. * \tparam UniqueMarker A type that is unique to each global function, used to * help cache the occupancy data for that global function. * + * The methods come in two flavors: + * - The fit_len methods choose grid and block sizes that result in a total + * number of threads of at least the len given in the constructor or 0 if + * that is not possible. + * - The fit_device methods choose grid and block sizes that best fit the + * occupancy of the global function according to the occupancy calculator and + * the Concretizer class. + * + * Common terms: + * - block size - threads per block + * - grid size - blocks per device + * ****************************************************************************** */ template < typename IdxT, typename Concretizer, typename UniqueMarker> @@ -445,7 +468,6 @@ struct ConcretizerImpl , m_len(len) { } - // Get the maximum block size IdxT get_max_block_size() const { auto data = hip_occupancy_max_blocks_threads( @@ -454,8 +476,7 @@ struct ConcretizerImpl return func_max_threads_per_block; } - // Get a block size that combined with the given grid size is large enough - // to do len work, or 0 if not possible + //! Get a block size when grid size is specified IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -467,16 +488,14 @@ struct ConcretizerImpl } } - // Get a grid size that combined with the given block size is large enough - // to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const { IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block); return func_blocks_per_device; } - // Get a block size and grid size that combined is large enough - // to do len work + //! Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_len() const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -485,9 +504,7 @@ struct ConcretizerImpl func_blocks_per_device); } - // Get a block size that combined with the given grid size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! 
Get a block size when grid size is specified IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const { IdxT func_max_threads_per_block = this->get_max_block_size(); @@ -495,9 +512,7 @@ struct ConcretizerImpl return std::min(func_threads_per_block, func_max_threads_per_block); } - // Get a grid size that combined with the given block size is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a grid size when block size is specified IdxT get_grid_size_to_fit_device(IdxT func_threads_per_block) const { auto data = hip_occupancy_max_blocks( @@ -507,9 +522,7 @@ struct ConcretizerImpl return std::min(func_blocks_per_device, func_max_blocks_per_device); } - // Get a block size and grid size that combined is the smaller of - // the amount need to achieve maximum occupancy on the device or - // the amount needed to do len work + //! Get a block size and grid size when neither is specified auto get_block_and_grid_size_to_fit_device() const { IdxT func_max_threads_per_block = this->get_max_block_size(); From 06b33c6b3edc9de102acc710fe9cab3da6c2b241 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 17 Apr 2024 07:49:52 -0700 Subject: [PATCH 045/108] fixup fraction static_assert --- include/RAJA/util/types.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 011082953d..03cd3b3deb 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -196,7 +196,7 @@ struct SizeList { template struct Fraction { - static_assert(denominator != int_t(0), "denominator may not be zero"); + static_assert(denominator != int_t(0), "denominator must not be zero"); using inverse = Fraction; From 33c92b508ebfeb7546f5ee6164b91476768268d6 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 17 Apr 2024 09:42:18 -0700 Subject: [PATCH 046/108] outmost index is the fastest index in sycl, swap the order around --- include/RAJA/policy/sycl/launch.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 0dffee6a21..9176444cd4 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -56,13 +56,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - params.threads.value[2] * params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; @@ -138,13 +138,13 @@ struct LaunchExecute> { // Compute the number of blocks and threads // - const ::sycl::range<3> blockSize(params.threads.value[0], + const ::sycl::range<3> blockSize(params.threads.value[2], params.threads.value[1], - params.threads.value[2]); + params.threads.value[0]); - const ::sycl::range<3> gridSize(params.threads.value[0] * params.teams.value[0], + const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2], params.threads.value[1] * params.teams.value[1], - params.threads.value[2] * 
params.teams.value[2]); + params.threads.value[0] * params.teams.value[0]); // Only launch kernel if we have something to iterate over constexpr size_t zero = 0; From 31ff12a79118ae0ccdd2d37a535890b490d3332a Mon Sep 17 00:00:00 2001 From: Adrien Bernede <51493078+adrienbernede@users.noreply.github.com> Date: Wed, 17 Apr 2024 19:49:52 +0200 Subject: [PATCH 047/108] Apply changes required by LC (token handling) --- .gitlab-ci.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8c1f7a472e..fb6bc7055c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -75,7 +75,7 @@ stages: include: - local: '.gitlab/custom-jobs-and-variables.yml' - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'pipelines/${CI_MACHINE}.yml' - artifact: '${CI_MACHINE}-jobs.yml' job: 'generate-job-lists' @@ -100,9 +100,11 @@ trigger-rajaperf: strategy: depend include: + - project: 'lc-templates/id_tokens' + file: 'id_tokens.yml' # [Optional] checks preliminary to running the actual CI test - project: 'radiuss/radiuss-shared-ci' - ref: 'v2023.12.3' + ref: 'v2024.04.0' file: 'utilities/preliminary-ignore-draft-pr.yml' # pipelines subscribed by the project - local: '.gitlab/subscribed-pipelines.yml' From 3b424476a0a8b2e08fe6f327087ecfffba0b7df2 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 17 Apr 2024 10:54:12 -0700 Subject: [PATCH 048/108] swap ordering of index as sycl uses a c-style convection --- .../RAJA_test-launch-direct-teams-threads-1D-execpol.hpp | 4 ++-- .../RAJA_test-launch-direct-teams-threads-3D-execpol.hpp | 8 ++++---- test/include/RAJA_test-launch-execpol.hpp | 2 +- .../RAJA_test-launch-loop-teams-threads-1D-execpol.hpp | 4 ++-- .../RAJA_test-launch-loop-teams-threads-3D-execpol.hpp | 8 ++++---- test/include/RAJA_test-launch-runtime-execpol.hpp | 8 ++++---- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp index 5b5dfdbebf..7179e48fdc 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-1D-execpol.hpp @@ -81,8 +81,8 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp index 38bc4c8bb0..f84823e414 100644 --- a/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-direct-teams-threads-3D-execpol.hpp @@ -100,12 +100,12 @@ using Hip_launch_policies = camp::list; using sycl_direct_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list; diff --git a/test/include/RAJA_test-launch-execpol.hpp b/test/include/RAJA_test-launch-execpol.hpp index 9961cd0741..fea90a8305 100644 --- a/test/include/RAJA_test-launch-execpol.hpp +++ b/test/include/RAJA_test-launch-execpol.hpp @@ -68,7 +68,7 @@ using Hip_launch_policies = camp::list< using sycl_policies = camp::list< RAJA::LaunchPolicy>, - 
RAJA::LoopPolicy>; + RAJA::LoopPolicy>; using Sycl_launch_policies = camp::list< sycl_policies diff --git a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp index 9e5779853c..6173fc6ffa 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-1D-execpol.hpp @@ -75,8 +75,8 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp index 9d217757b2..d703216a13 100644 --- a/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp +++ b/test/include/RAJA_test-launch-loop-teams-threads-3D-execpol.hpp @@ -95,12 +95,12 @@ using Hip_launch_policies = camp::list< #if defined(RAJA_ENABLE_SYCL) using sycl_loop_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //slowest index RAJA::LoopPolicy, - RAJA::LoopPolicy, - RAJA::LoopPolicy, + RAJA::LoopPolicy, //fastest index + RAJA::LoopPolicy, RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy >; using Sycl_launch_policies = camp::list< diff --git a/test/include/RAJA_test-launch-runtime-execpol.hpp b/test/include/RAJA_test-launch-runtime-execpol.hpp index bec07358e6..fa2b39f761 100644 --- a/test/include/RAJA_test-launch-runtime-execpol.hpp +++ b/test/include/RAJA_test-launch-runtime-execpol.hpp @@ -52,8 +52,8 @@ using Sequential_launch_policies = camp::list; using seq_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using Sequential_launch_policies = camp::list; @@ -110,8 +110,8 @@ using OpenMP_launch_policies = camp::list; using omp_sycl_policies = camp::list< RAJA::LaunchPolicy>, - RAJA::LoopPolicy, - RAJA::LoopPolicy + RAJA::LoopPolicy, + RAJA::LoopPolicy >; using OpenMP_launch_policies = camp::list; From 27ec80d4c44e1c409519a9e23d5c1eb23a14c6bd Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 11:12:05 -0700 Subject: [PATCH 049/108] Apply suggestions from code review to docs Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/cook_book.rst | 3 +-- docs/sphinx/user_guide/cook_book/reduction.rst | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book.rst b/docs/sphinx/user_guide/cook_book.rst index 44c89c3d51..91494f3674 100644 --- a/docs/sphinx/user_guide/cook_book.rst +++ b/docs/sphinx/user_guide/cook_book.rst @@ -14,8 +14,7 @@ RAJA Cook Book The following sections show common use case patterns and the recommended RAJA features and policies to use with them. They are intended -for users to copy and paste into their code and provide guidance on -which policy to use with each backend to get good performance. +to provide users with complete beyond usage examples beyond what can be found in other parts of the RAJA User Guide. In particular, the examples and discussion provide guidance on RAJA execution policy selection to improve performance of user application codes. .. 
toctree:: :maxdepth: 2 diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index 5c17e3a626..e8925ee019 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -12,7 +12,7 @@ Cooking with Reductions ======================= -Please see the following section for more info on RAJA reductions: +Please see the following section for overview discussion about RAJA reductions: * :ref:`feat-reductions-label`. @@ -50,10 +50,10 @@ The results of these operations will yield the following values: RAJA uses policy types to specify how things are implemented. -The forall execution policy specifies how the loop is run in the forall. -For example ``RAJA::seq_exec`` runs a c-style for loop. The -``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a cuda kernel with -256 threads per block and other cuda kernel launch parameters, like the +The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. +For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The +``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a CUDA GPU kernel with +256 threads per block and other CUDA kernel launch parameters, like the number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; @@ -67,7 +67,7 @@ The reduction policy specifies how the reduction is done and must match the execution policy. For example ``RAJA::seq_reduce`` does a sequential reduction and can only be used with sequential execution policies. The ``RAJA::cuda_reduce_atomic`` policy uses atomics, if possible with the given -data type, and can only be used with cuda execution policies.:: +data type, and can only be used with cuda execution policies. Similarly for other RAJA execution back-ends, such as HIP and OpenMP. Here are example RAJA reduction policies whose names are indicative of which execution policies they work with:: using reduce_policy = RAJA::seq_reduce; // using reduce_policy = RAJA::omp_reduce; From 2bd50afd92ccdf7fd3ed114f18e9446f3b735249 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 11:13:05 -0700 Subject: [PATCH 050/108] Apply suggestions from code review Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 3554873a08..3b95b8e153 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -429,7 +429,7 @@ policies have the prefix ``hip_``. thread warp. ========================================= ============= ======================================= -When a cuda/hip policy leaves parameters like the block size and/or grid size +When a CUDA or HIP policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. 
The following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` policies: From c8df5e552cd3ba17d482fdbf33c75f045daf4a0c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 11:19:46 -0700 Subject: [PATCH 051/108] Apply suggestions from code review Co-authored-by: Rich Hornung --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 4 ++-- include/RAJA/policy/hip/MemUtils_HIP.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 54a8a7e008..95dbd4bbba 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -290,7 +290,7 @@ cudaDeviceProp get_device_prop() return prop; } -//! Get a cached copy of the device properties +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE cudaDeviceProp& device_prop() { @@ -421,7 +421,7 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, ****************************************************************************** * * \brief Concretizer Implementation that chooses block size and/or grid - * size when they has not been specified at compile time. + * size when one or both has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. * \tparam Concretizer Class that determines the max number of blocks to use diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index bfb07bc569..af2a39c191 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -292,7 +292,7 @@ hipDeviceProp_t get_device_prop() return prop; } -//! Get a cached copy of the device properties +//! Get a copy of the device properties, this copy is cached on first use to speedup later calls RAJA_INLINE hipDeviceProp_t& device_prop() { @@ -437,7 +437,7 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, ****************************************************************************** * * \brief Concretizer Implementation that chooses block size and/or grid - * size when they has not been specified at compile time. + * size when one or both has not been specified at compile time. * * \tparam IdxT Index type to use for integer calculations. * \tparam Concretizer Class that determines the max number of blocks to use From a03f647cbdee0c1171b572f44bf8bb6ceb09e841 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 14:59:43 -0700 Subject: [PATCH 052/108] Put initialization in occupancy cacl data structs --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 60 ++++++++-------------- include/RAJA/policy/hip/MemUtils_HIP.hpp | 55 ++++++++------------ 2 files changed, 42 insertions(+), 73 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 95dbd4bbba..4e85f948e8 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -294,25 +294,27 @@ cudaDeviceProp get_device_prop() RAJA_INLINE cudaDeviceProp& device_prop() { - static cudaDeviceProp prop = get_device_prop(); + static thread_local cudaDeviceProp prop = get_device_prop(); return prop; } +static constexpr int cuda_occupancy_uninitialized_int = -1; +static constexpr size_t cuda_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + //! 
Struct with the maximum theoretical occupancy of the device struct CudaFixedMaxBlocksData { - int device_sm_per_device; - int device_max_threads_per_sm; + int device_sm_per_device = cuda::device_prop().multiProcessorCount; + int device_max_threads_per_sm = cuda::device_prop().maxThreadsPerMultiProcessor; }; //! Get the maximum theoretical occupancy of the device RAJA_INLINE CudaFixedMaxBlocksData cuda_max_blocks() { - static thread_local CudaFixedMaxBlocksData data { - cuda::device_prop().multiProcessorCount, - cuda::device_prop().maxThreadsPerMultiProcessor }; + static thread_local CudaFixedMaxBlocksData data; return data; } @@ -320,9 +322,9 @@ CudaFixedMaxBlocksData cuda_max_blocks() //! Struct with the maximum occupancy of a kernel in simple terms struct CudaOccMaxBlocksThreadsData { - size_t func_dynamic_shmem_per_block; - int func_max_blocks_per_device; - int func_max_threads_per_block; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = cuda_occupancy_uninitialized_int; + int func_max_threads_per_block = cuda_occupancy_uninitialized_int; }; //! Get the maximum occupancy of a kernel with unknown threads per block @@ -331,33 +333,26 @@ RAJA_INLINE CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func, size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized_int = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksThreadsData data { - uninitialized_size_t, - uninitialized_int, - uninitialized_int }; + static thread_local CudaOccMaxBlocksThreadsData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + cudaErrchk(cudaOccupancyMaxPotentialBlockSize( &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); - data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - } return data; } //! Struct with the maximum occupancy of a kernel in specific terms -struct CudaOccMaxBlocksData +struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData { - size_t func_dynamic_shmem_per_block; - int func_threads_per_block; - int device_sm_per_device; - int device_max_threads_per_sm; - int func_max_blocks_per_sm; + size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t; + int func_threads_per_block = cuda_occupancy_uninitialized_int; + int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int; }; //! 
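// Note on the caching pattern above: each of these occupancy helpers is
// templated on a UniqueMarker type, so the `static thread_local` data object
// is a distinct instance per kernel (and per thread), and with this change its
// members start at the sentinel "uninitialized" values via in-class
// initializers, so the first call always performs the query and later calls
// re-query only when the dynamic shared memory size changes. The same idiom in
// miniature, where `Cached`, `cached_query`, and `expensive_query` are
// placeholder names standing in for the structs and cudaOccupancy* calls used
// here, not RAJA API:
//
//   struct Cached {
//     size_t key   = std::numeric_limits<size_t>::max();  // sentinel
//     int    value = -1;                                   // sentinel
//   };
//
//   template < typename UniqueMarker >
//   int cached_query(size_t key)
//   {
//     static thread_local Cached c;       // one cache per marker and thread
//     if (c.key != key) {                 // re-query only when the key changes
//       c.key   = key;
//       c.value = expensive_query(key);   // placeholder for the real query
//     }
//     return c.value;
//   }
//!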
Get the maximum occupancy of a kernel with compile time threads per block @@ -366,18 +361,12 @@ RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block) { - static constexpr int uninitialized_int = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksData data { - uninitialized_size_t, - func_threads_per_block, - cuda::device_prop().multiProcessorCount, - cuda::device_prop().maxThreadsPerMultiProcessor, - uninitialized_int }; + static thread_local CudaOccMaxBlocksData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block)); @@ -393,14 +382,7 @@ RAJA_INLINE CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static constexpr int uninitialized_int = -1; - static constexpr size_t uninitialized_size_t = std::numeric_limits::max(); - static thread_local CudaOccMaxBlocksData data { - uninitialized_size_t, - uninitialized_int, - cuda::device_prop().multiProcessorCount, - cuda::device_prop().maxThreadsPerMultiProcessor, - uninitialized_int }; + static thread_local CudaOccMaxBlocksData data; if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || data.func_threads_per_block != func_threads_per_block ) { diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index af2a39c191..82b7bfc633 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -296,25 +296,27 @@ hipDeviceProp_t get_device_prop() RAJA_INLINE hipDeviceProp_t& device_prop() { - static hipDeviceProp_t prop = get_device_prop(); + static thread_local hipDeviceProp_t prop = get_device_prop(); return prop; } +static constexpr int hip_occupancy_uninitialized_int = -1; +static constexpr size_t hip_occupancy_uninitialized_size_t = + std::numeric_limits::max(); + //! Struct with the maximum theoretical occupancy of the device struct HipFixedMaxBlocksData { - int device_sm_per_device; - int device_max_threads_per_sm; + int device_sm_per_device = hip::device_prop().multiProcessorCount; + int device_max_threads_per_sm = hip::device_prop().maxThreadsPerMultiProcessor; }; //! Get the maximum theoretical occupancy of the device RAJA_INLINE HipFixedMaxBlocksData hip_max_blocks() { - static thread_local HipFixedMaxBlocksData data { - hip::device_prop().multiProcessorCount, - hip::device_prop().maxThreadsPerMultiProcessor }; + static thread_local HipFixedMaxBlocksData data; return data; } @@ -322,9 +324,9 @@ HipFixedMaxBlocksData hip_max_blocks() //! Struct with the maximum occupancy of a kernel in simple terms struct HipOccMaxBlocksThreadsData { - size_t func_dynamic_shmem_per_block; - int func_max_blocks_per_device; - int func_max_threads_per_block; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_max_blocks_per_device = hip_occupancy_uninitialized_int; + int func_max_threads_per_block = hip_occupancy_uninitialized_int; }; //! 
Get the maximum occupancy of a kernel with unknown threads per block @@ -333,13 +335,12 @@ RAJA_INLINE HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, size_t func_dynamic_shmem_per_block) { - static thread_local HipOccMaxBlocksThreadsData data { - std::numeric_limits::max(), - -1, - -1 }; + static thread_local HipOccMaxBlocksThreadsData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { + data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxPotentialBlockSize( &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block)); @@ -350,21 +351,17 @@ HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func, data.func_max_threads_per_block = 1024; #endif - data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; - } return data; } //! Struct with the maximum occupancy of a kernel in specific terms -struct HipOccMaxBlocksData +struct HipOccMaxBlocksData : HipFixedMaxBlocksData { - size_t func_dynamic_shmem_per_block; - int func_threads_per_block; - int device_sm_per_device; - int device_max_threads_per_sm; - int func_max_blocks_per_sm; + size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t; + int func_threads_per_block = hip_occupancy_uninitialized_int; + int func_max_blocks_per_sm = hip_occupancy_uninitialized_int; }; //! Get the maximum occupancy of a kernel with compile time threads per block @@ -373,16 +370,12 @@ RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block) { - static thread_local HipOccMaxBlocksData data { - std::numeric_limits::max(), - func_threads_per_block, - hip::device_prop().multiProcessorCount, - hip::device_prop().maxThreadsPerMultiProcessor, - -1 }; + static thread_local HipOccMaxBlocksData data; if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) { data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block; + data.func_threads_per_block = func_threads_per_block; #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor( @@ -393,7 +386,6 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 } #endif - } return data; @@ -405,12 +397,7 @@ RAJA_INLINE HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block, int func_threads_per_block) { - static thread_local HipOccMaxBlocksData data { - std::numeric_limits::max(), - -1, - hip::device_prop().multiProcessorCount, - hip::device_prop().maxThreadsPerMultiProcessor, - -1 }; + static thread_local HipOccMaxBlocksData data; if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block || data.func_threads_per_block != func_threads_per_block ) { From 36fe701474273235f10b8d2375c01981adb9728e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 24 Apr 2024 14:53:58 -0700 Subject: [PATCH 053/108] Add a for_each impl with overload for camp::list of types --- include/RAJA/RAJA.hpp | 1 + include/RAJA/util/for_each.hpp | 95 +++++++++++ test/unit/algorithm/CMakeLists.txt | 5 + .../test-algorithm-util-for_each.cpp | 150 ++++++++++++++++++ 4 files changed, 251 insertions(+) create mode 100644 include/RAJA/util/for_each.hpp create mode 100644 test/unit/algorithm/test-algorithm-util-for_each.cpp diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 
f41aad477b..32522a1f0d 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -35,6 +35,7 @@ #include "RAJA/util/types.hpp" #include "RAJA/util/plugins.hpp" #include "RAJA/util/Registry.hpp" +#include "RAJA/util/for_each.hpp" // diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp new file mode 100644 index 0000000000..c95f40da35 --- /dev/null +++ b/include/RAJA/util/for_each.hpp @@ -0,0 +1,95 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA for_each templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_for_each_HPP +#define RAJA_util_for_each_HPP + +#include "RAJA/config.hpp" + +#include +#include + +#include "camp/list.hpp" + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/types.hpp" + +namespace RAJA +{ + +namespace detail +{ + +// runtime loop applying func to each element in the range in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func) +{ + for (; begin != end; ++begin) { + func(*begin); + } + + return func; +} + +// compile time expansion applying func to a each type in the list in order +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const&, UnaryFunc func) +{ + // braced init lists are evaluated in order + int seq_unused_array[] = {(func(Ts{}), 0)...}; + RAJA_UNUSED_VAR(seq_unused_array); + + return func; +} + +} // namespace detail + + +/*! + \brief Apply func to all the elements in the given range in order + using a sequential for loop in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/for_each +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + for_each(Container&& c, UnaryFunc func) +{ + using std::begin; + using std::end; + + return detail::for_each(begin(c), end(c), std::move(func)); +} + +/*! 
+ \brief Apply func to each type in the given list in order + using a compile-time expansion in O(N) operations and O(1) extra memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +UnaryFunc for_each_type(camp::list const& c, UnaryFunc func) +{ + return detail::for_each_type(c, std::move(func)); +} + +} // namespace RAJA + +#endif diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt index 856e4519b6..0142a94ed3 100644 --- a/test/unit/algorithm/CMakeLists.txt +++ b/test/unit/algorithm/CMakeLists.txt @@ -88,3 +88,8 @@ unset( SORT_BACKENDS ) unset( SEQUENTIAL_UTIL_SORTS ) unset( CUDA_UTIL_SORTS ) unset( HIP_UTIL_SORTS ) + + +raja_add_test( + NAME test-algorithm-util-for_each + SOURCES test-algorithm-util-for_each.cpp) diff --git a/test/unit/algorithm/test-algorithm-util-for_each.cpp b/test/unit/algorithm/test-algorithm-util-for_each.cpp new file mode 100644 index 0000000000..db918ad234 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-for_each.cpp @@ -0,0 +1,150 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing unit tests for for_each +/// + +#include "RAJA_test-base.hpp" + +#include "RAJA_unit-test-types.hpp" + +#include "camp/resource.hpp" + +#include +#include +#include + +template +class ForEachUnitTest : public ::testing::Test {}; + +TYPED_TEST_SUITE(ForEachUnitTest, UnitIndexTypes); + + +TYPED_TEST(ForEachUnitTest, EmptyRange) +{ + std::vector numbers; + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + number += 1; + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); + ASSERT_EQ(numbers.size(), 0); +} + +TYPED_TEST(ForEachUnitTest, VectorRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 13; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 13); + for (TypeParam i = 0; i < 13; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, RajaSpanRange) +{ + std::vector numbers; + for (TypeParam i = 0; i < 11; ++i) { + numbers.push_back(i); + } + + std::vector copies; + RAJA::for_each(RAJA::make_span(numbers.data(), 11), [&](TypeParam& number) { + copies.push_back(number); + number += 1; + }); + + ASSERT_EQ(copies.size(), 11); + for (TypeParam i = 0; i < 11; ++i) { + ASSERT_EQ(numbers[i], copies[i]+1); + } +} + +TYPED_TEST(ForEachUnitTest, SetRange) +{ + std::set numbers; + for (TypeParam i = 0; i < 6; ++i) { + numbers.insert(i); + } + + std::vector copies; + RAJA::for_each(numbers, [&](TypeParam const& number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 6); + for (TypeParam i = 0; i < 6; ++i) { + ASSERT_EQ(i, copies[i]); + ASSERT_EQ(numbers.count(i), 1); + } +} + + +TYPED_TEST(ForEachUnitTest, EmptyTypeList) +{ + using numbers = camp::list<>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(number); + }); + + ASSERT_EQ(copies.size(), 0); +} + + +template < typename T, T val > +T get_num(std::integral_constant) +{ + return val; +} + +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + using 
numbers = camp::list, + std::integral_constant, + std::integral_constant, + std::integral_constant, + std::integral_constant>; + + std::vector copies; + RAJA::for_each_type(numbers{}, [&](auto number) { + copies.push_back(get_num(number)); + }); + + ASSERT_EQ(copies.size(), 5); + for (TypeParam i = 0; i < 5; ++i) { + ASSERT_EQ(i, copies[i]); + } +} +/// +template < typename TypeParam, + std::enable_if_t::value>* = nullptr > +void run_int_type_test() +{ + // ignore non-ints +} + +TYPED_TEST(ForEachUnitTest, IntTypeList) +{ + run_int_type_test(); +} From 76760fe478e587909282f62242ffa5faba22c5eb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 27 Apr 2024 14:18:10 -0700 Subject: [PATCH 054/108] Fix zero sized array --- include/RAJA/util/for_each.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp index c95f40da35..b279ec29ff 100644 --- a/include/RAJA/util/for_each.hpp +++ b/include/RAJA/util/for_each.hpp @@ -54,7 +54,7 @@ RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list const&, UnaryFunc func) { // braced init lists are evaluated in order - int seq_unused_array[] = {(func(Ts{}), 0)...}; + int seq_unused_array[] = {0, (func(Ts{}), 0)...}; RAJA_UNUSED_VAR(seq_unused_array); return func; From b360da9e9b1111be6f266cfcaed62ff7accc8deb Mon Sep 17 00:00:00 2001 From: artv3 Date: Tue, 30 Apr 2024 10:36:28 -0700 Subject: [PATCH 055/108] add note about thread ordering --- docs/sphinx/user_guide/feature/policies.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 3b95b8e153..5affa42203 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -525,9 +525,24 @@ write more explicit policies. unspecified so a runtime number of threads is used, but grid_size is ignored so blocks are ignored when getting indices. + GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +.. note:: SYCL uses C++-style ordering in which the right + most index corresponds to having unit stride. + In a three-dimensional compute grid this means + that dimension 2 has the unit stride while + dimension 0 has the longest stride. This is + important to note as the ordering is reverse + compared to the CUDA and HIP programming models. + + When using RAJA launch thread and team configuration + follows CUDA and HIP programming models and is always + configured in three-dimensions. This means that dimension + 2 always exist and should be used as one would the + x dimension for CUDA and HIP. + ======================================== ============= ============================== SYCL Execution Policies Works with Brief description ======================================== ============= ============================== From bad63908c57d83005b93ef7bbe8c6b5e8f5c874e Mon Sep 17 00:00:00 2001 From: artv3 Date: Tue, 30 Apr 2024 10:39:08 -0700 Subject: [PATCH 056/108] more docs --- docs/sphinx/user_guide/feature/policies.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 5affa42203..107bc27af2 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -524,7 +524,6 @@ write more explicit policies. ignored. 
For example in cuda_thread_x_direct block_size is unspecified so a runtime number of threads is used, but grid_size is ignored so blocks are ignored when getting indices. - GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -535,7 +534,9 @@ GPU Policies for SYCL that dimension 2 has the unit stride while dimension 0 has the longest stride. This is important to note as the ordering is reverse - compared to the CUDA and HIP programming models. + compared to the CUDA and HIP programming models. + CUDA and HIP employ a x/y/z ordering in which + dimension x has the unit stride. When using RAJA launch thread and team configuration follows CUDA and HIP programming models and is always From 5ddd86a8c631b78e4bac500458db1d394c3b3af0 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 30 Apr 2024 10:56:15 -0700 Subject: [PATCH 057/108] Update docs/sphinx/user_guide/feature/policies.rst Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 107bc27af2..b3ac763cc9 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -541,7 +541,7 @@ GPU Policies for SYCL When using RAJA launch thread and team configuration follows CUDA and HIP programming models and is always configured in three-dimensions. This means that dimension - 2 always exist and should be used as one would the + 2 always exists and should be used as one would use the x dimension for CUDA and HIP. ======================================== ============= ============================== From aef818c91dee2a78dd12aeedc18caca52f6724ed Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 30 Apr 2024 10:56:25 -0700 Subject: [PATCH 058/108] Update docs/sphinx/user_guide/feature/policies.rst Co-authored-by: Rich Hornung --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index b3ac763cc9..aad065cb16 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -538,7 +538,7 @@ GPU Policies for SYCL CUDA and HIP employ a x/y/z ordering in which dimension x has the unit stride. - When using RAJA launch thread and team configuration + When using RAJA::launch, thread and team configuration follows CUDA and HIP programming models and is always configured in three-dimensions. This means that dimension 2 always exists and should be used as one would use the From 2b0864a40053b338edc02f118bcf65c745bfde46 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 10:26:55 -0700 Subject: [PATCH 059/108] Add options to cuda/hip reduction policies Add replication and atomic_stride to cuda/hip reduction policies. They currently default to 0 which lets RAJA choose values for these parameters automatically. 
--- include/RAJA/policy/cuda/policy.hpp | 4 +- include/RAJA/policy/cuda/reduce.hpp | 236 +++++++++++++++++----------- include/RAJA/policy/hip/policy.hpp | 4 +- include/RAJA/policy/hip/reduce.hpp | 234 ++++++++++++++++----------- 4 files changed, 285 insertions(+), 193 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index e7a72b2be7..c9efc45566 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -238,7 +238,8 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template +template struct cuda_reduce_base : public RAJA:: make_policy_pattern_launch_platform_t -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, + T identity, + TempIterator device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; int blockId = blockIdx.x + gridDim.x * blockIdx.y + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = (numBlocks / replication) - + ((replicationId < (numBlocks % replication)) ? 0 : 1); T temp = block_reduce(val, identity); // one thread per block writes to device_mem - bool lastBlock = false; + bool isLastBlock = false; if (threadId == 0) { device_mem.set(blockId, temp); // ensure write visible to all threadblocks __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); } // returns non-zero value if any thread passes in a non-zero value - lastBlock = __syncthreads_or(lastBlock); + isLastBlock = __syncthreads_or(isLastBlock); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; - for (int i = threadId; i < numBlocks; i += numThreads) { + for (int i = replicationId + threadId*replication; + i < numBlocks; + i += numThreads*replication) { Combiner{}(temp, device_mem.get(i)); } @@ -523,7 +530,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? replicationId : replication; } namespace expt { @@ -653,64 +660,71 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! 
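// Worked example of the block-to-slot mapping in grid_reduce above: block
// blockId feeds replication slot replicationId = blockId % replication, and
// that slot's counter lives at device_count[replicationId * atomic_stride],
// spacing the counters of different slots apart (the default stride is derived
// from ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE further down in this file so that
// slots should not share a cache line). With replication = 4 and numBlocks = 10:
//
//   slot 0 <- blocks 0, 4, 8    wrap_around = 10/4 - 0 = 2
//   slot 1 <- blocks 1, 5, 9    wrap_around = 10/4 - 0 = 2
//   slot 2 <- blocks 2, 6       wrap_around = 10/4 - 1 = 1
//   slot 3 <- blocks 3, 7       wrap_around = 10/4 - 1 = 1
//
// wrap_around is one less than the number of blocks mapped to the slot, so the
// last block to finish in each slot sees old_count == wrap_around; its threads
// then gather the device_mem entries belonging to that slot (indices congruent
// to replicationId mod replication) and thread 0 returns the slot's
// replicationId. Every other thread returns `replication`, which the caller
// reads as "no final value produced here".
//!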
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = numBlocks / replication + + ((replicationId < (numBlocks % replication)) ? 2 : 1); + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; + device_mem[atomicOffset] = identity; // consider making this atomic __threadfence(); - ::atomicAdd(device_count, 1u); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; __threadfence(); - RAJA::reduce::cuda::atomic{}(device_mem[0], temp); + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + val = device_mem[atomicOffset]; // consider making this atomic } } - return lastBlock; + return isLastBlock ? replicationId : replication; } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[replication]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -785,7 +799,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[replication] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -822,7 +836,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! 
get new value for use in resource - T* new_value(::RAJA::resources::Cuda res) + auto new_value(::RAJA::resources::Cuda res) -> T(&)[replication] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -842,7 +856,7 @@ class PinnedTally Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -889,7 +903,8 @@ class PinnedTally //! Reduction data for Cuda Offload -- stores value, host pointer, and device //! pointer -template +template struct Reduce_Data { mutable T value; @@ -898,7 +913,7 @@ struct Reduce_Data { RAJA::detail::SoAPtr device; bool own_device_ptr; - Reduce_Data() : Reduce_Data(T(), T()){}; + Reduce_Data() : Reduce_Data(T(), T()){} /*! \brief create from a default value and offload information * @@ -928,7 +943,13 @@ struct Reduce_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -936,8 +957,10 @@ struct Reduce_Data { { T temp = value; - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -949,9 +972,10 @@ struct Reduce_Data { if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); + size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; + device.allocate(numSlots); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -974,7 +998,8 @@ struct Reduce_Data { //! Reduction data for Cuda Offload -- stores value, host pointer -template +template struct ReduceAtomic_Data { mutable T value; @@ -1008,7 +1033,13 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -1016,9 +1047,10 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -1028,9 +1060,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); + device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -1053,7 +1085,8 @@ struct ReduceAtomic_Data { }; //! 
Cuda Reduction entity -- generalize on reduction, and type -template +template class Reduce { public: @@ -1063,7 +1096,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new PinnedTally}, val(init_val, identity_) { } @@ -1090,9 +1123,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1137,7 +1169,10 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[replication] = *n; + for (size_t r = 0; r < replication; ++r) { + Combiner{}(val.value, values[r]); + } } tally_or_val_ptr.list->free_list(); } @@ -1160,12 +1195,21 @@ class Reduce private: const Reduce* parent; + static constexpr size_t replication = (t_replication > 0) + ? t_replication + : 1; + static constexpr size_t atomic_stride = (t_atomic_stride > 0) + ? t_atomic_stride + : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + //! union to hold either pointer to PinnedTally or poiter to value // only use list before setup for device and only use val_ptr after union tally_u { - PinnedTally* list; + PinnedTally* list; T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; + constexpr tally_u(PinnedTally* l) : list(l){}; constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; }; @@ -1174,8 +1218,8 @@ class Reduce //! cuda reduction data storage class and folding algorithm using reduce_data_type = typename std::conditional< maybe_atomic && RAJA::reduce::cuda::cuda_atomic_available::value, - cuda::ReduceAtomic_Data, - cuda::Reduce_Data>::type; + cuda::ReduceAtomic_Data, + cuda::Reduce_Data>::type; //! storage for reduction data reduce_data_type val; @@ -1184,13 +1228,13 @@ class Reduce } // end namespace cuda //! specialization of ReduceSum for cuda_reduce -template -class ReduceSum, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1202,13 +1246,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for cuda_reduce -template -class ReduceBitOr, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1220,13 +1264,13 @@ class ReduceBitOr, T> }; //! 
specialization of ReduceBitAnd for cuda_reduce -template -class ReduceBitAnd, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1238,13 +1282,13 @@ class ReduceBitAnd, T> }; //! specialization of ReduceMin for cuda_reduce -template -class ReduceMin, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1256,13 +1300,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for cuda_reduce -template -class ReduceMax, T> - : public cuda::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = cuda::Reduce, T, maybe_atomic>; + using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1274,18 +1318,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for cuda_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public cuda::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1324,18 +1368,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for cuda_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public cuda:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! 
constructor requires a default value for the reducer diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 65c87ff203..c814bec83d 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -229,7 +229,8 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template +template struct hip_reduce_base : public RAJA:: make_policy_pattern_launch_platform_t -RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, - T identity, - TempIterator device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, + T identity, + TempIterator device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; - unsigned int wrap_around = numBlocks - 1; int blockId = blockIdx.x + gridDim.x * blockIdx.y + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = (numBlocks / replication) - + ((replicationId < (numBlocks % replication)) ? 0 : 1); T temp = block_reduce(val, identity); // one thread per block writes to device_mem - __shared__ bool lastBlock; + __shared__ bool isLastBlock; if (threadId == 0) { device_mem.set(blockId, temp); // ensure write visible to all threadblocks __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around) ? 1: 0; + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); } // returns non-zero value if any thread passes in a non-zero value __syncthreads(); // last block accumulates values from device_mem - if (lastBlock) { + if (isLastBlock) { temp = identity; - for (int i = threadId; i < numBlocks; i += numThreads) { + for (int i = replicationId + threadId*replication; + i < numBlocks; + i += numThreads*replication) { Combiner{}(temp, device_mem.get(i)); } @@ -396,7 +403,7 @@ RAJA_DEVICE RAJA_INLINE bool grid_reduce(T& val, } } - return lastBlock && threadId == 0; + return (isLastBlock && threadId == 0) ? replicationId : replication; } namespace expt { @@ -526,64 +533,71 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template -RAJA_DEVICE RAJA_INLINE bool grid_reduce_atomic(T& val, - T identity, - T* device_mem, - unsigned int* device_count) +template +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, + T identity, + T* device_mem, + unsigned int* device_count) { - int numBlocks = gridDim.x * gridDim.y * gridDim.z; - unsigned int wrap_around = numBlocks + 1; - int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; - // one thread in first block initializes device_mem + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + int numBlocks = gridDim.x * gridDim.y * gridDim.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + unsigned int wrap_around = numBlocks / replication + + ((replicationId < (numBlocks % replication)) ? 2 : 1); + + // the first block of each replication initializes device_mem if (threadId == 0) { - unsigned int old_val = ::atomicCAS(device_count, 0u, 1u); + unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[0] = identity; + device_mem[atomicOffset] = identity; // consider making this atomic __threadfence(); - ::atomicAdd(device_count, 1u); + ::atomicAdd(&device_count[atomicOffset], 1u); } } T temp = block_reduce(val, identity); - // one thread per block performs atomic on device_mem - bool lastBlock = false; + // one thread per block performs an atomic on device_mem + bool isLastBlock = false; if (threadId == 0) { - // thread waits for device_mem to be initialized - while (static_cast(device_count)[0] < 2u) + // wait for device_mem to be initialized + while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; __threadfence(); - RAJA::reduce::hip::atomic{}(device_mem[0], temp); + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); __threadfence(); // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(device_count, wrap_around); - lastBlock = (old_count == wrap_around); + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); + isLastBlock = (old_count == wrap_around); - // last block gets value from device_mem - if (lastBlock) { - val = device_mem[0]; + // the last block for each replication gets the value from device_mem + if (isLastBlock) { + val = device_mem[atomicOffset]; // consider making this atomic } } - return lastBlock; + return isLastBlock ? replicationId : replication; } } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T value; + T values[replication]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -658,7 +672,7 @@ class PinnedTally return ret; } - T& operator*() { return m_n->value; } + auto operator*() -> T(&)[replication] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -695,7 +709,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! 
get new value for use in resource - T* new_value(::RAJA::resources::Hip res) + auto new_value(::RAJA::resources::Hip res) -> T(&)[replication] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -715,7 +729,7 @@ class PinnedTally Node* n = hip::pinned_mempool_type::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; - return &n->value; + return n->values; } //! synchronize all resources used @@ -762,7 +776,8 @@ class PinnedTally //! Reduction data for Hip Offload -- stores value, host pointer, and device //! pointer -template +template struct Reduce_Data { mutable T value; @@ -801,7 +816,13 @@ struct Reduce_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -809,8 +830,10 @@ struct Reduce_Data { { T temp = value; - if (impl::grid_reduce(temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -822,9 +845,10 @@ struct Reduce_Data { if (act) { hip_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - device.allocate(numBlocks); + size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; + device.allocate(numSlots); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -847,7 +871,8 @@ struct Reduce_Data { //! Reduction data for Hip Offload -- stores value, host pointer -template +template struct ReduceAtomic_Data { mutable T value; @@ -856,7 +881,7 @@ struct ReduceAtomic_Data { T* device; bool own_device_ptr; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){} ReduceAtomic_Data(T initValue, T identity_) : value{initValue}, @@ -881,7 +906,13 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - void init_grid_val(T* output) { *output = identity; } + T* init_grid_vals(T(&output)[replication]) + { + for (size_t r = 0; r < replication; ++r) { + output[r] = identity; + } + return &output[0]; + } //! reduce values in grid to single value, store in output RAJA_DEVICE @@ -889,9 +920,10 @@ struct ReduceAtomic_Data { { T temp = value; - if (impl::grid_reduce_atomic( - temp, identity, device, device_count)) { - *output = temp; + size_t replicationId = impl::grid_reduce_atomic( + temp, identity, device, device_count); + if (replicationId != replication) { + output[replicationId] = temp; } } @@ -901,9 +933,9 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(1); + device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); device_count = device_zeroed_mempool_type::getInstance() - .template malloc(1); + .template malloc(replication*atomic_stride); own_device_ptr = true; } return act; @@ -926,7 +958,8 @@ struct ReduceAtomic_Data { }; //! 
Hip Reduction entity -- generalize on reduction, and type -template +template class Reduce { public: @@ -936,7 +969,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new PinnedTally}, val(init_val, identity_) { } @@ -963,9 +996,8 @@ class Reduce #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) if (parent) { if (val.setupForDevice()) { - tally_or_val_ptr.val_ptr = - tally_or_val_ptr.list->new_value(currentResource()); - val.init_grid_val(tally_or_val_ptr.val_ptr); + tally_or_val_ptr.val_ptr = val.init_grid_vals( + tally_or_val_ptr.list->new_value(currentResource())); parent = nullptr; } } @@ -1010,7 +1042,10 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - Combiner{}(val.value, *n); + T(&values)[replication] = *n; + for (size_t r = 0; r < replication; ++r) { + Combiner{}(val.value, values[r]); + } } tally_or_val_ptr.list->free_list(); } @@ -1033,12 +1068,21 @@ class Reduce private: const Reduce* parent; + static constexpr size_t replication = (t_replication > 0) + ? t_replication + : 32; + static constexpr size_t atomic_stride = (t_atomic_stride > 0) + ? t_atomic_stride + : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + //! union to hold either pointer to PinnedTally or poiter to value // only use list before setup for device and only use val_ptr after union tally_u { - PinnedTally* list; + PinnedTally* list; T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; + constexpr tally_u(PinnedTally* l) : list(l){}; constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; }; @@ -1047,8 +1091,8 @@ class Reduce //! hip reduction data storage class and folding algorithm using reduce_data_type = typename std::conditional< maybe_atomic && RAJA::reduce::hip::hip_atomic_available::value, - hip::ReduceAtomic_Data, - hip::Reduce_Data>::type; + hip::ReduceAtomic_Data, + hip::Reduce_Data>::type; //! storage for reduction data reduce_data_type val; @@ -1057,13 +1101,13 @@ class Reduce } // end namespace hip //! specialization of ReduceSum for hip_reduce -template -class ReduceSum, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceSum, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1075,13 +1119,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for hip_reduce -template -class ReduceBitOr, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitOr, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1093,13 +1137,13 @@ class ReduceBitOr, T> }; //! 
specialization of ReduceBitAnd for hip_reduce -template -class ReduceBitAnd, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceBitAnd, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1111,13 +1155,13 @@ class ReduceBitAnd, T> }; //! specialization of ReduceMin for hip_reduce -template -class ReduceMin, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMin, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1129,13 +1173,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for hip_reduce -template -class ReduceMax, T> - : public hip::Reduce, T, maybe_atomic> +template +class ReduceMax, T> + : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> { public: - using Base = hip::Reduce, T, maybe_atomic>; + using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1147,18 +1191,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for hip_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public hip::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1197,18 +1241,18 @@ class ReduceMinLoc, T, IndexType> }; //! specialization of ReduceMaxLoc for hip_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public hip:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic> + maybe_atomic, replication, atomic_stride> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! 
constructor requires a default value for the reducer From 18d29cef93f02bd4b02e58c1ddbb646d8af9a2c6 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 11:43:18 -0700 Subject: [PATCH 060/108] Reorder non-atomic grid reduce device storage This makes the final block for each replication have coalesced reads as it combines the slots --- include/RAJA/policy/cuda/reduce.hpp | 33 +++++++++++++++++------------ include/RAJA/policy/hip/reduce.hpp | 33 +++++++++++++++++------------ 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 22fd7348a0..0dcb1ee3ae 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -490,23 +490,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, (gridDim.x * gridDim.y) * blockIdx.z; int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int replicationId = (blockId%replication); - int atomicOffset = replicationId*atomic_stride; + int replicationId = blockId % replication; + int slotId = blockId / replication; + + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); - unsigned int wrap_around = (numBlocks / replication) - - ((replicationId < (numBlocks % replication)) ? 0 : 1); + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; T temp = block_reduce(val, identity); // one thread per block writes to device_mem bool isLastBlock = false; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value @@ -516,10 +521,10 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (isLastBlock) { temp = identity; - for (int i = replicationId + threadId*replication; - i < numBlocks; - i += numThreads*replication) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -972,8 +977,8 @@ struct Reduce_Data { if (act) { cuda_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; - device.allocate(numSlots); + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); device_count = device_zeroed_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 4889c7f598..fd79e67600 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -363,23 +363,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, (gridDim.x * gridDim.y) * blockIdx.z; int numBlocks = gridDim.x * gridDim.y * gridDim.z; - int replicationId = (blockId%replication); 
- int atomicOffset = replicationId*atomic_stride; + int replicationId = blockId % replication; + int slotId = blockId / replication; + + int maxNumSlots = (numBlocks + replication - 1) / replication; + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); - unsigned int wrap_around = (numBlocks / replication) - - ((replicationId < (numBlocks % replication)) ? 0 : 1); + int atomicOffset = replicationId * atomic_stride; + int beginSlots = replicationId * maxNumSlots; + int blockSlot = beginSlots + slotId; T temp = block_reduce(val, identity); // one thread per block writes to device_mem __shared__ bool isLastBlock; if (threadId == 0) { - device_mem.set(blockId, temp); + device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots-1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); + isLastBlock = (old_count == (numSlots-1)); } // returns non-zero value if any thread passes in a non-zero value @@ -389,10 +394,10 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (isLastBlock) { temp = identity; - for (int i = replicationId + threadId*replication; - i < numBlocks; - i += numThreads*replication) { - Combiner{}(temp, device_mem.get(i)); + for (unsigned int i = threadId; + i < numSlots; + i += numThreads) { + Combiner{}(temp, device_mem.get(beginSlots + i)); } temp = block_reduce(temp, identity); @@ -845,8 +850,8 @@ struct Reduce_Data { if (act) { hip_dim_t gridDim = currentGridDim(); size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; - size_t numSlots = ((numBlocks + replication - 1) / replication) * replication; - device.allocate(numSlots); + size_t maxNumSlots = (numBlocks + replication - 1) / replication; + device.allocate(maxNumSlots*replication); device_count = device_zeroed_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; From 76b24505fa5f6f028047435e318420c0c9b01f7d Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 1 Apr 2024 13:02:07 -0700 Subject: [PATCH 061/108] Add special case for small numbers of blocks Now the algorithm avoids atomics and extra block reductions if they are unnecessary --- include/RAJA/policy/cuda/reduce.hpp | 25 ++++++++++++++++++++----- include/RAJA/policy/hip/reduce.hpp | 25 ++++++++++++++++++++----- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 0dcb1ee3ae..3704eb4303 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -503,6 +503,13 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T temp = block_reduce(val, identity); + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + // one thread per block writes to device_mem bool isLastBlock = false; if (threadId == 0) { @@ -681,8 +688,16 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, int replicationId = (blockId%replication); int atomicOffset = replicationId*atomic_stride; - unsigned int wrap_around = numBlocks / replication + - ((replicationId < (numBlocks % replication)) ? 
2 : 1); + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } // the first block of each replication initializes device_mem if (threadId == 0) { @@ -705,9 +720,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, __threadfence(); RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index fd79e67600..187de47ee2 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -376,6 +376,13 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T temp = block_reduce(val, identity); + if (numSlots <= 1u) { + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } + // one thread per block writes to device_mem __shared__ bool isLastBlock; if (threadId == 0) { @@ -554,8 +561,16 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, int replicationId = (blockId%replication); int atomicOffset = replicationId*atomic_stride; - unsigned int wrap_around = numBlocks / replication + - ((replicationId < (numBlocks % replication)) ? 2 : 1); + unsigned int numSlots = (numBlocks / replication) + + ((replicationId < (numBlocks % replication)) ? 1 : 0); + + if (numSlots <= 1u) { + T temp = block_reduce(val, identity); + if (threadId == 0) { + val = temp; + } + return (threadId == 0) ? replicationId : replication; + } // the first block of each replication initializes device_mem if (threadId == 0) { @@ -578,9 +593,9 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, __threadfence(); RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); __threadfence(); - // increment counter, (wraps back to zero if old count == wrap_around) - unsigned int old_count = ::atomicInc(&device_count[atomicOffset], wrap_around); - isLastBlock = (old_count == wrap_around); + // increment counter, (wraps back to zero if old count == (numSlots+1)) + unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); + isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { From ae50f0895d3dcbfa29dd15a02e98502f278804c1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 6 Apr 2024 23:57:37 -0700 Subject: [PATCH 062/108] Add device pinned allocators These are useful for operations that need to access memory on the host and on the device, but the device accesses are more performance critical. 
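For reference, the allocation pattern behind the new CUDA DevicePinnedAllocator can be sketched outside of RAJA's basic_mempool machinery as below. This is an illustration only, assuming a device that supports managed memory and cudaMemAdvise; the kernel and variable names (fill_kernel, n) are invented for the example and error handling is reduced to early returns.

    // Standalone sketch: managed memory that prefers device residency but
    // remains host-accessible, mirroring the runtime calls used by the allocator.
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void fill_kernel(double* x, int n)
    {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) { x[i] = 2.0 * i; }
    }

    int main()
    {
      const int n = 1024;
      const size_t nbytes = n * sizeof(double);

      int device = 0;
      if (cudaGetDevice(&device) != cudaSuccess) { return 1; }

      void* ptr = nullptr;
      if (cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal) != cudaSuccess) { return 1; }
      // Hint the driver: keep pages resident on the device, but let the host read them.
      cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device);
      cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);

      double* x = static_cast<double*>(ptr);
      fill_kernel<<<(n + 255) / 256, 256>>>(x, n);
      if (cudaDeviceSynchronize() != cudaSuccess) { return 1; }

      printf("x[10] = %f\n", x[10]);  // host read, no explicit copy needed
      cudaFree(ptr);
      return 0;
    }

The corresponding HIP allocator added in this patch takes the simpler route of plain hipMalloc/hipFree, as the diff below shows.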
--- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 25 ++++++++++++++++++++++ include/RAJA/policy/hip/MemUtils_HIP.hpp | 20 +++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 4e85f948e8..5a66aff20e 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -111,9 +111,34 @@ struct DeviceZeroedAllocator { } }; +//! Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + int device; + cudaErrchk(cudaGetDevice(&device)); + void* ptr; + cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device)); + cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId)); + + return ptr; + } + + // returns true on success, false on failure + bool free(void* ptr) + { + cudaErrchk(cudaFree(ptr)); + return true; + } +}; + using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 82b7bfc633..63a8c9911c 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -113,9 +113,29 @@ struct DeviceZeroedAllocator { } }; +//! Allocator for device pinned memory for use in basic_mempool +struct DevicePinnedAllocator { + + // returns a valid pointer on success, nullptr on failure + void* malloc(size_t nbytes) + { + void* ptr; + hipErrchk(hipMalloc(&ptr, nbytes)); + return ptr; + } + + // returns true on success, false on failure + bool free(void* ptr) + { + hipErrchk(hipFree(ptr)); + return true; + } +}; + using device_mempool_type = basic_mempool::MemPool; using device_zeroed_mempool_type = basic_mempool::MemPool; +using device_pinned_mempool_type = basic_mempool::MemPool; using pinned_mempool_type = basic_mempool::MemPool; namespace detail From e90045709f4a6f1f54f8130e27d0b5cb9a9471ff Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sat, 6 Apr 2024 23:59:02 -0700 Subject: [PATCH 063/108] Add an accessor template arg to SoAPtr --- include/RAJA/util/SoAPtr.hpp | 62 ++++++++++++++++++++++++++++++------ include/RAJA/util/types.hpp | 26 +++++++++++++++ 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp index 616b8d21d4..00a2fce111 100644 --- a/include/RAJA/util/SoAPtr.hpp +++ b/include/RAJA/util/SoAPtr.hpp @@ -20,8 +20,11 @@ #include "RAJA/config.hpp" +#include + // for RAJA::reduce::detail::ValueLoc #include "RAJA/pattern/detail/reduce.hpp" +#include "RAJA/util/types.hpp" namespace RAJA { @@ -38,18 +41,37 @@ namespace detail */ template > + RAJA::basic_mempool::generic_allocator>, + typename accessor = DefaultAccessor > class SoAPtr { - using value_type = T; + template < typename, typename, typename > + friend class SoAPtr; // fiend other instantiations of this class public: + using value_type = T; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) 
= default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -65,8 +87,8 @@ class SoAPtr RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; } - RAJA_HOST_DEVICE value_type get(size_t i) const { return mem[i]; } - RAJA_HOST_DEVICE void set(size_t i, value_type val) { mem[i] = val; } + RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); } + RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); } private: value_type* mem = nullptr; @@ -75,21 +97,41 @@ class SoAPtr /*! * @brief Specialization for RAJA::reduce::detail::ValueLoc. */ -template -class SoAPtr, mempool> +template +class SoAPtr, mempool, accessor> { - using value_type = RAJA::reduce::detail::ValueLoc; using first_type = T; using second_type = IndexType; + template < typename, typename, typename > + friend class SoAPtr; // fiend other instantiations of this class + public: + using value_type = RAJA::reduce::detail::ValueLoc; + + template < typename rhs_accessor > + using rebind_accessor = SoAPtr; + SoAPtr() = default; + SoAPtr(SoAPtr const&) = default; + SoAPtr(SoAPtr &&) = default; + SoAPtr& operator=(SoAPtr const&) = default; + SoAPtr& operator=(SoAPtr &&) = default; + explicit SoAPtr(size_t size) : mem(mempool::getInstance().template malloc(size)), mem_idx(mempool::getInstance().template malloc(size)) { } + template < typename rhs_accessor, + std::enable_if_t::value>* = nullptr > + RAJA_HOST_DEVICE + explicit SoAPtr(SoAPtr const& rhs) + : mem(rhs.mem) + , mem_idx(rhs.mem_idx) + { } + SoAPtr& allocate(size_t size) { mem = mempool::getInstance().template malloc(size); @@ -110,12 +152,12 @@ class SoAPtr, mempool> RAJA_HOST_DEVICE value_type get(size_t i) const { - return value_type(mem[i], mem_idx[i]); + return value_type(accessor::get(mem, i), accessor::get(mem_idx, i)); } RAJA_HOST_DEVICE void set(size_t i, value_type val) { - mem[i] = val; - mem_idx[i] = val.getLoc(); + accessor::set(mem, i, first_type(val)); + accessor::set(mem_idx, i, val.getLoc()); } private: diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 03cd3b3deb..f19d9947b6 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -30,6 +30,9 @@ #include "camp/helpers.hpp" +#include "RAJA/util/macros.hpp" + + namespace RAJA { @@ -863,6 +866,29 @@ using const_UnalignedReal_ptr = ConstRestrictRealPtr; #endif + +namespace detail { + +/*! + * \brief Abstracts access to memory using normal memory accesses. + */ +struct DefaultAccessor +{ + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i) + { + return ptr[i]; + } + + template < typename T > + static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val) + { + ptr[i] = val; + } +}; + +} // namespace detail + } // namespace RAJA #endif // closing endif for header file include guard From 03734897f8de7d8fd60ecda9a51181bf66235e93 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 00:05:45 -0700 Subject: [PATCH 064/108] Add more cuda/hip reducer tunings Add option to initalize reducers with atomics on the host. Add option to to use algorithm that avoids device scope fences. Split cuda/hip reduce header into reduce and intrinsics headers. 
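As an illustration of how these tunings surface at the call site, the sketch below selects one of the policy aliases introduced in this patch from ordinary RAJA user code. It is an example under assumptions rather than part of the patch: the function, pointer, and size names are invented, and any of the new aliases (cuda_reduce_with_fences, cuda_reduce_avoid_fences, cuda_reduce_atomic_host_init, ...) could be substituted for the one shown.

    #include "RAJA/RAJA.hpp"

    // Sum a device array with a tuned CUDA reducer policy (illustrative only).
    double sum_on_device(const double* d_x, int n)
    {
      // Swap this alias to compare tunings, e.g. cuda_reduce_avoid_fences.
      using reduce_pol = RAJA::cuda_reduce_atomic_host_init;
      using exec_pol   = RAJA::cuda_exec<256>;

      RAJA::ReduceSum<reduce_pol, double> total(0.0);

      RAJA::forall<exec_pol>(RAJA::RangeSegment(0, n),
        [=] RAJA_DEVICE (int i) {
          total += d_x[i];
        });

      return total.get();  // combines the per-replication tally slots on the host
    }

Because replication and atomic_stride default to named_usage::unspecified in cuda_reduce_base, the aliases can be used as-is and the reducer falls back to its built-in per-type defaults, as the diff below shows.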
--- docs/Licenses/rocprim-license.txt | 21 + include/RAJA/policy/cuda/intrinsics.hpp | 452 +++++++++++++++ include/RAJA/policy/cuda/policy.hpp | 44 +- include/RAJA/policy/cuda/reduce.hpp | 736 +++++++++--------------- include/RAJA/policy/hip/intrinsics.hpp | 346 +++++++++++ include/RAJA/policy/hip/policy.hpp | 49 +- include/RAJA/policy/hip/reduce.hpp | 589 ++++++++----------- include/RAJA/util/macros.hpp | 2 + include/RAJA/util/types.hpp | 69 +++ test/include/RAJA_test-reducepol.hpp | 12 +- 10 files changed, 1487 insertions(+), 833 deletions(-) create mode 100644 docs/Licenses/rocprim-license.txt create mode 100644 include/RAJA/policy/cuda/intrinsics.hpp create mode 100644 include/RAJA/policy/hip/intrinsics.hpp diff --git a/docs/Licenses/rocprim-license.txt b/docs/Licenses/rocprim-license.txt new file mode 100644 index 0000000000..976ca2abb3 --- /dev/null +++ b/docs/Licenses/rocprim-license.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp new file mode 100644 index 0000000000..053d7ab50e --- /dev/null +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -0,0 +1,452 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for CUDA execution. + * + * These methods should work on any platform that supports + * CUDA devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_cuda_intrinsics_HPP +#define RAJA_cuda_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_CUDA) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/cuda/policy.hpp" + + +namespace RAJA +{ + +namespace cuda +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory using normal memory accesses. 
+ */ +struct AccessorWithFences : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory using atomic memory accesses. + * + * \Note Memory access through this class does not guarantee safe access to a + * value that is accessed concurrently by other threads as it may split + * memory operations into multiple atomic instructions. + * \Note Fences used through this class only guarantee ordering, they do not + * guarantee visiblity of non-atomic memory operations as it may not + * actually flush the cache. + * + ****************************************************************************** + */ +struct AccessorAvoidingFences +{ + // cuda has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { + atomicExch(&ptr[i], u.array[i]); + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + + +// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 +constexpr size_t max_shfl_int_type_size = sizeof(unsigned long long); +#else +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); +#endif + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); +#else + u.array[i] = ::__shfl_xor(u.array[i], laneMask); +#endif + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); +#else + u.array[i] = ::__shfl(u.array[i], srcLane); +#endif + } + return u.get_value(); +} + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) +{ + return ::__shfl_xor_sync(0xffffffffu, var, laneMask); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +#endif + + +#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +template <> +RAJA_DEVICE 
RAJA_INLINE double shfl_sync(double var, int srcLane) +{ + return ::__shfl_sync(0xffffffffu, var, srcLane); +} + +#else + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +#endif + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = __shfl_xor_sync(0xffffffff, temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::cuda::WARP_SIZE; + int warpNum = threadId / policy::cuda::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::cuda::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::cuda::WARP_SIZE) { + + static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + // Need to separate declaration and initialization for clang-cuda + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + + // Partial placement new: Should call new(tmpsd) here but recasting memory + // to avoid calling constructor/destructor in shared memory. 
+ RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::cuda::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace cuda + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_CUDA guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index c9efc45566..b3b8ae04d1 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -159,6 +159,17 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; +template < size_t t_replication, size_t t_atomic_stride, + bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > +struct ReduceTuning +{ + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool maybe_atomic = t_maybe_atomic; + static constexpr bool avoid_fences = t_avoid_fences; + static constexpr bool init_on_host = t_init_on_host; +}; + } // namespace cuda namespace policy @@ -238,9 +249,8 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct cuda_reduce_base +template < typename tuning > +struct cuda_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using cuda_reduce = cuda_reduce_base; +template < bool maybe_atomic, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified, + bool init_on_host = false, + bool avoid_fences = false > +using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< + replication, atomic_stride, + maybe_atomic, init_on_host, avoid_fences> >; + +using cuda_reduce_with_fences = cuda_reduce_base; + +using cuda_reduce_avoid_fences = cuda_reduce_base; + +using cuda_reduce_atomic_with_fences = cuda_reduce_base; + +using cuda_reduce_atomic_avoid_fences = cuda_reduce_base; + +using cuda_reduce_atomic_host_init = cuda_reduce_base; + +using cuda_reduce = cuda_reduce_with_fences; -using cuda_reduce_atomic = cuda_reduce_base; +using cuda_reduce_atomic = cuda_reduce_atomic_host_init; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -1142,6 +1171,11 @@ using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; // policies usable with reducers +using policy::cuda::cuda_reduce_with_fences; +using policy::cuda::cuda_reduce_avoid_fences; +using policy::cuda::cuda_reduce_atomic_with_fences; +using policy::cuda::cuda_reduce_atomic_avoid_fences; +using policy::cuda::cuda_reduce_atomic_host_init; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 3704eb4303..ccb310d2f9 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -25,6 +25,8 @@ #if defined(RAJA_ENABLE_CUDA) +#include + #include #include "RAJA/util/macros.hpp" @@ -38,6 +40,7 @@ #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/cuda/MemUtils_CUDA.hpp" 
+#include "RAJA/policy/cuda/intrinsics.hpp" #if defined(RAJA_ENABLE_DESUL_ATOMICS) #include "RAJA/policy/desul/atomic.hpp" @@ -56,6 +59,7 @@ namespace reduce namespace cuda { + //! atomic operator version of Combiner object template struct atomic; @@ -84,6 +88,22 @@ struct atomic> { } }; +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); + } +}; + template struct cuda_atomic_available { static constexpr const bool value = @@ -101,387 +121,18 @@ namespace cuda namespace impl { -/*! - * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. - */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - static constexpr size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 -constexpr const size_t max_shfl_int_type_size = sizeof(long long); -#else -constexpr const size_t max_shfl_int_type_size = sizeof(int); -#endif - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. 
- * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask); -#else - u.array[i] = ::__shfl_xor(u.array[i], laneMask); -#endif - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane); -#else - u.array[i] = ::__shfl(u.array[i], srcLane); -#endif - } - return u.value; -} - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync(unsigned int var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_xor_sync(long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync(unsigned long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync(long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync(unsigned long long var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_xor_sync(double var, int laneMask) -{ - return ::__shfl_xor_sync(0xffffffffu, var, laneMask); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -#endif - - -#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000 - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync(unsigned int var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long shfl_sync(long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync(unsigned long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE long long shfl_sync(long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync(unsigned long long var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl_sync(0xffffffffu, var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE double shfl_sync(double var, int srcLane) -{ - return 
::__shfl_sync(0xffffffffu, var, srcLane); -} - -#else - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -#endif - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = __shfl_xor_sync(0xffffffff, temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::cuda::WARP_SIZE; - int warpNum = threadId / policy::cuda::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::cuda::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::cuda::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::cuda::WARP_SIZE) { - - static_assert(policy::cuda::MAX_WARPS <= policy::cuda::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - // Need to separate declaration and initialization for clang-cuda - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - - // Partial placement new: Should call new(tmpsd) here but recasting memory - // to avoid calling constructor/destructor in shared memory. - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); - - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } - - __syncthreads(); - - if (warpNum == 0) { - - // read per warp values - if (warpId * policy::cuda::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } - - for (int i = 1; i < policy::cuda::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - } - - __syncthreads(); - } - - return temp; -} - - //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T identity, - TempIterator device_mem, + TempIterator in_device_mem, unsigned int* device_count) { + typename TempIterator::template rebind_accessor device_mem(in_device_mem); + int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; @@ -515,7 +166,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (threadId == 0) { device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots-1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); isLastBlock = (old_count == (numSlots-1)); @@ -527,6 +178,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, // last block accumulates values from device_mem if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); for (unsigned int i = threadId; i < numSlots; @@ -653,6 +305,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -672,7 +325,8 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template +template RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, T identity, T* device_mem, @@ -703,8 +357,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, if (threadId == 0) { unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[atomicOffset] = identity; // consider making this atomic - __threadfence(); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); ::atomicAdd(&device_count[atomicOffset], 1u); } } @@ -717,34 +371,58 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, // wait for device_mem to be initialized while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); + Accessor::fence_acquire(); RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots+1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { - val = device_mem[atomicOffset]; // consider making this atomic + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } return isLastBlock ? replicationId : replication; } +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::cuda::atomic{}(device_mem[atomicOffset], temp); + } +} + } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T values[replication]; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -819,7 +497,7 @@ class PinnedTally return ret; } - auto operator*() -> T(&)[replication] { return m_n->values; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -856,7 +534,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - auto new_value(::RAJA::resources::Cuda res) -> T(&)[replication] + auto new_value(::RAJA::resources::Cuda res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -873,7 +551,7 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = cuda::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; return n->values; @@ -896,7 +574,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - cuda::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -923,15 +601,21 @@ class PinnedTally //! Reduction data for Cuda Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +struct Reduce_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; - bool own_device_ptr; + RAJA::detail::SoAPtr device; + bool owns_device_pointer; Reduce_Data() : Reduce_Data(T(), T()){} @@ -945,7 +629,7 @@ struct Reduce_Data { identity{identity_}, device_count{nullptr}, device{}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -955,7 +639,7 @@ struct Reduce_Data { identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -963,9 +647,9 @@ struct Reduce_Data { //! 
initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -977,8 +661,9 @@ struct Reduce_Data { { T temp = value; - size_t replicationId = impl::grid_reduce( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -994,9 +679,9 @@ struct Reduce_Data { size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; size_t maxNumSlots = (numBlocks + replication - 1) / replication; device.allocate(maxNumSlots*replication); - device_count = device_zeroed_mempool_type::getInstance() + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); - own_device_ptr = true; + owns_device_pointer = true; } return act; } @@ -1005,28 +690,114 @@ struct Reduce_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; - //! Reduction data for Cuda Offload -- stores value, host pointer template -struct ReduceAtomic_Data { +struct ReduceAtomicInitialized_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool owns_device_pointer; + + ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){}; + + ReduceAtomicInitialized_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + owns_device_pointer{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + owns_device_pointer{false} + { + } + + ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_initialized( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + owns_device_pointer = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = owns_device_pointer; + if (act) { + is_setup = false; + owns_device_pointer = false; + } + return act; + } +}; + +//! 
Reduction data for Cuda Offload -- stores value, host pointer +template +struct ReduceAtomic_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; T* device; - bool own_device_ptr; + bool owns_device_pointer; ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; @@ -1035,7 +806,7 @@ struct ReduceAtomic_Data { identity{identity_}, device_count{nullptr}, device{nullptr}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -1045,7 +816,7 @@ struct ReduceAtomic_Data { identity{other.identity}, device_count{other.device_count}, device{other.device}, - own_device_ptr{false} + owns_device_pointer{false} { } @@ -1053,9 +824,9 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -1067,8 +838,9 @@ struct ReduceAtomic_Data { { T temp = value; - size_t replicationId = impl::grid_reduce_atomic( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce_atomic< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -1080,10 +852,10 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); - device_count = device_zeroed_mempool_type::getInstance() + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); - own_device_ptr = true; + owns_device_pointer = true; } return act; } @@ -1092,23 +864,58 @@ struct ReduceAtomic_Data { // free device pointers bool teardownForDevice() { - bool act = own_device_ptr; + bool act = owns_device_pointer; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; - own_device_ptr = false; + owns_device_pointer = false; } return act; } }; //! Cuda Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 1; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + static constexpr bool use_atomic = tuning::maybe_atomic && + RAJA::reduce::cuda::cuda_atomic_available::value; + + using Accessor = std::conditional_t; + + //! cuda reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t, + cuda::ReduceAtomic_Data>, + cuda::Reduce_Data>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! 
union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -1116,7 +923,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -1189,8 +996,8 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - T(&values)[replication] = *n; - for (size_t r = 0; r < replication; ++r) { + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { Combiner{}(val.value, values[r]); } } @@ -1214,47 +1021,20 @@ class Reduce private: const Reduce* parent; - - static constexpr size_t replication = (t_replication > 0) - ? t_replication - : 1; - static constexpr size_t atomic_stride = (t_atomic_stride > 0) - ? t_atomic_stride - : ((policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) - : 1); - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! cuda reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::cuda::cuda_atomic_available::value, - cuda::ReduceAtomic_Data, - cuda::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace cuda //! specialization of ReduceSum for cuda_reduce -template -class ReduceSum, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceSum, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1266,13 +1046,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for cuda_reduce -template -class ReduceBitOr, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitOr, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1284,13 +1064,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for cuda_reduce -template -class ReduceBitAnd, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitAnd, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1302,13 +1082,13 @@ class ReduceBitAnd, T }; //! 
specialization of ReduceMin for cuda_reduce -template -class ReduceMin, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMin, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1320,13 +1100,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for cuda_reduce -template -class ReduceMax, T> - : public cuda::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMax, T> + : public cuda::Reduce, T, tuning> { public: - using Base = cuda::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = cuda::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1338,18 +1118,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for cuda_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public cuda::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1388,18 +1168,18 @@ class ReduceMinLoc, T }; //! specialization of ReduceMaxLoc for cuda_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public cuda:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = cuda::Reduce; + using Base = cuda::Reduce; using Base::Base; //! constructor requires a default value for the reducer diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp new file mode 100644 index 0000000000..374a66323e --- /dev/null +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -0,0 +1,346 @@ +/*! + ****************************************************************************** + * + * \file + * + * \brief Header file containing RAJA intrinsics templates for HIP execution. + * + * These methods should work on any platform that supports + * HIP devices. + * + ****************************************************************************** + */ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_hip_intrinsics_HPP +#define RAJA_hip_intrinsics_HPP + +#include "RAJA/config.hpp" + +#if defined(RAJA_ENABLE_HIP) + +#include + +#include + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/SoAArray.hpp" +#include "RAJA/util/types.hpp" + +#include "RAJA/policy/hip/policy.hpp" + + +namespace RAJA +{ + +namespace hip +{ + +namespace impl +{ + +/*! + * \brief Abstracts access to memory using normal memory accesses. 
+ */ +struct AccessorWithFences : RAJA::detail::DefaultAccessor +{ + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { + __threadfence(); + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { + __threadfence(); + } +}; + +/*! + ****************************************************************************** + * + * \brief Abstracts access to memory using atomic memory accesses. + * + * \Note Memory access through this class does not guarantee safe access to a + * value that is accessed concurrently by other threads as it may split + * memory operations into multiple atomic instructions. + * \Note Fences used through this class only guarantee ordering, they do not + * guarantee visiblity of non-atomic memory operations as it may not + * actually flush the cache. + * + ****************************************************************************** + */ +struct AccessorAvoidingFences +{ + // hip has 32 and 64 bit atomics + static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); + static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long); + + template < typename T > + static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) + u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + u.array[i] = atomicAdd(&ptr[i], integer_type(0)); +#endif + } + + return u.get_value(); + } + + template < typename T > + static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val) + { + using ArrayType = RAJA::detail::AsIntegerArray; + using integer_type = typename ArrayType::integer_type; + + ArrayType u; + u.set_value(val); + auto ptr = reinterpret_cast(in_ptr + idx); + + for (size_t i = 0; i < u.array_size(); ++i) { +#if defined(RAJA_USE_HIP_INTRINSICS) + __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); +#else + atomicExch(&ptr[i], u.array[i]); +#endif + } + } + + static RAJA_DEVICE RAJA_INLINE void fence_acquire() + { +#if defined(RAJA_USE_HIP_INTRINSICS) + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); +#else + __threadfence(); +#endif + } + + static RAJA_DEVICE RAJA_INLINE void fence_release() + { +#if defined(RAJA_USE_HIP_INTRINSICS) + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + // Wait until all vmem operations complete (s_waitcnt vmcnt(0)) + __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8)); +#else + __threadfence(); +#endif + } +}; + + +// hip only has shfl primitives for 32 bits +constexpr size_t min_shfl_int_type_size = sizeof(unsigned int); +constexpr size_t max_shfl_int_type_size = sizeof(unsigned int); + +/*! + ****************************************************************************** + * + * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. + * + * \Note Returns an undefined value if src lane is inactive (divergence). + * Returns this lane's value if src lane is out of bounds or has exited. 
+ * + ****************************************************************************** + */ +template +RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl_xor(u.array[i], laneMask); + } + return u.get_value(); +} + +template +RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) +{ + RAJA::detail::AsIntegerArray u; + u.set_value(var); + + for (size_t i = 0; i < u.array_size(); ++i) { + u.array[i] = ::__shfl(u.array[i], srcLane); + } + return u.get_value(); +} + + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) +{ + return ::__shfl_xor(var, laneMask); +} + +template <> +RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + +template <> +RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) +{ + return ::__shfl(var, srcLane); +} + + +//! reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + return temp; +} + +/*! + * Allreduce values in a warp. + * + * + * This does a butterfly pattern leaving each lane with the full reduction + * + */ +template +RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) +{ + T temp = val; + + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + return temp; +} + + +//! 
reduce values in block into thread 0 +template +RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) +{ + int numThreads = blockDim.x * blockDim.y * blockDim.z; + + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int warpId = threadId % policy::hip::WARP_SIZE; + int warpNum = threadId / policy::hip::WARP_SIZE; + + T temp = val; + + if (numThreads % policy::hip::WARP_SIZE == 0) { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + + } else { + + // reduce each warp + for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { + int srcLane = threadId ^ i; + T rhs = shfl_sync(temp, srcLane); + // only add from threads that exist (don't double count own value) + if (srcLane < numThreads) { + Combiner{}(temp, rhs); + } + } + } + + // reduce per warp values + if (numThreads > policy::hip::WARP_SIZE) { + + static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, + "Max Warps must be less than or equal to Warp Size for this algorithm to work"); + + __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; + RAJA::detail::SoAArray* sd = + reinterpret_cast *>(tmpsd); + + // write per warp values to shared memory + if (warpId == 0) { + sd->set(warpNum, temp); + } + + __syncthreads(); + + if (warpNum == 0) { + + // read per warp values + if (warpId * policy::hip::WARP_SIZE < numThreads) { + temp = sd->get(warpId); + } else { + temp = identity; + } + + for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { + T rhs = shfl_xor_sync(temp, i); + Combiner{}(temp, rhs); + } + } + + __syncthreads(); + } + + return temp; +} + +} // end namespace impl + +} // end namespace hip + +} // end namespace RAJA + +#endif // closing endif for RAJA_ENABLE_HIP guard + +#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index c814bec83d..6a53e91177 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -154,6 +154,17 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; +template < size_t t_replication, size_t t_atomic_stride, + bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > +struct ReduceTuning +{ + static constexpr size_t replication = t_replication; + static constexpr size_t atomic_stride = t_atomic_stride; + static constexpr bool maybe_atomic = t_maybe_atomic; + static constexpr bool avoid_fences = t_avoid_fences; + static constexpr bool init_on_host = t_init_on_host; +}; + } // namespace hip namespace policy @@ -229,9 +240,9 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average /////////////////////////////////////////////////////////////////////// /// -template -struct hip_reduce_base + +template < typename tuning > +struct hip_reduce_policy : public RAJA:: make_policy_pattern_launch_platform_t; -using hip_reduce = hip_reduce_base; +template < bool maybe_atomic, + size_t replication = named_usage::unspecified, + size_t atomic_stride = named_usage::unspecified, + bool init_on_host = false, + bool avoid_fences = false > +using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< + replication, atomic_stride, + maybe_atomic, init_on_host, avoid_fences> >; + +using hip_reduce_with_fences = hip_reduce_base; + +using hip_reduce_avoid_fences = hip_reduce_base; + +using hip_reduce_atomic_with_fences = hip_reduce_base; + +using hip_reduce_atomic_avoid_fences = hip_reduce_base; + +using 
hip_reduce_atomic_host_init = hip_reduce_base; + +#if defined(RAJA_USE_HIP_INTRINSICS) +using hip_reduce = hip_reduce_avoid_fences; +#else +using hip_reduce = hip_reduce_with_fences; +#endif -using hip_reduce_atomic = hip_reduce_base; +using hip_reduce_atomic = hip_reduce_atomic_host_init; // Policy for RAJA::statement::Reduce that reduces threads in a block @@ -1059,6 +1093,11 @@ using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; // policies usable with reducers +using policy::hip::hip_reduce_with_fences; +using policy::hip::hip_reduce_avoid_fences; +using policy::hip::hip_reduce_atomic_with_fences; +using policy::hip::hip_reduce_atomic_avoid_fences; +using policy::hip::hip_reduce_atomic_host_init; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 187de47ee2..6579633957 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -40,6 +40,7 @@ #include "RAJA/pattern/reduce.hpp" #include "RAJA/policy/hip/MemUtils_HIP.hpp" +#include "RAJA/policy/hip/intrinsics.hpp" #include "RAJA/policy/hip/atomic.hpp" #include "RAJA/policy/hip/policy.hpp" #include "RAJA/policy/hip/raja_hiperrchk.hpp" @@ -52,6 +53,7 @@ namespace reduce namespace hip { + //! atomic operator version of Combiner object template struct atomic; @@ -80,6 +82,22 @@ struct atomic> { } }; +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); + } +}; + +template +struct atomic> { + RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) + { + RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); + } +}; + template struct hip_atomic_available { static constexpr const bool value = @@ -97,264 +115,18 @@ namespace hip namespace impl { -/*! - * \brief Abstracts T into an equal or greater size array of integers whose - * size is between min_integer_type_size and max_interger_type_size inclusive. 
- */ -template -union AsIntegerArray { - - static_assert(min_integer_type_size <= max_integer_type_size, - "incompatible min and max integer type size"); - using integer_type = typename std::conditional< - ((alignof(T) >= alignof(long long) && - sizeof(long long) <= max_integer_type_size) || - sizeof(long) < min_integer_type_size), - long long, - typename std::conditional< - ((alignof(T) >= alignof(long) && - sizeof(long) <= max_integer_type_size) || - sizeof(int) < min_integer_type_size), - long, - typename std::conditional< - ((alignof(T) >= alignof(int) && - sizeof(int) <= max_integer_type_size) || - sizeof(short) < min_integer_type_size), - int, - typename std::conditional< - ((alignof(T) >= alignof(short) && - sizeof(short) <= max_integer_type_size) || - sizeof(char) < min_integer_type_size), - short, - typename std::conditional< - ((alignof(T) >= alignof(char) && - sizeof(char) <= max_integer_type_size)), - char, - void>::type>::type>::type>::type>::type; - static_assert(!std::is_same::value, - "could not find a compatible integer type"); - static_assert(sizeof(integer_type) >= min_integer_type_size, - "integer_type smaller than min integer type size"); - static_assert(sizeof(integer_type) <= max_integer_type_size, - "integer_type greater than max integer type size"); - - constexpr static size_t num_integer_type = - (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); - - T value; - integer_type array[num_integer_type]; - - RAJA_HOST_DEVICE constexpr AsIntegerArray(T value_) : value(value_){}; - - RAJA_HOST_DEVICE constexpr size_t array_size() const - { - return num_integer_type; - } -}; - -// hip only has shfl primitives for 32 bits -constexpr const size_t min_shfl_int_type_size = sizeof(int); -constexpr const size_t max_shfl_int_type_size = sizeof(int); - -/*! - ****************************************************************************** - * - * \brief Method to shuffle 32b registers in sum reduction for arbitrary type. - * - * \Note Returns an undefined value if src lane is inactive (divergence). - * Returns this lane's value if src lane is out of bounds or has exited. - * - ****************************************************************************** - */ -template -RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl_xor(u.array[i], laneMask); - } - return u.value; -} - -template -RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane) -{ - AsIntegerArray u(var); - - for (size_t i = 0; i < u.array_size(); ++i) { - u.array[i] = ::__shfl(u.array[i], srcLane); - } - return u.value; -} - - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_xor_sync(int var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_xor_sync(float var, int laneMask) -{ - return ::__shfl_xor(var, laneMask); -} - -template <> -RAJA_DEVICE RAJA_INLINE int shfl_sync(int var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - -template <> -RAJA_DEVICE RAJA_INLINE float shfl_sync(float var, int srcLane) -{ - return ::__shfl(var, srcLane); -} - - -//! 
reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity)) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - return temp; -} - -/*! - * Allreduce values in a warp. - * - * - * This does a butterfly pattern leaving each lane with the full reduction - * - */ -template -RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val) -{ - T temp = val; - - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - return temp; -} - - -//! reduce values in block into thread 0 -template -RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity) -{ - int numThreads = blockDim.x * blockDim.y * blockDim.z; - - int threadId = threadIdx.x + blockDim.x * threadIdx.y + - (blockDim.x * blockDim.y) * threadIdx.z; - - int warpId = threadId % policy::hip::WARP_SIZE; - int warpNum = threadId / policy::hip::WARP_SIZE; - - T temp = val; - - if (numThreads % policy::hip::WARP_SIZE == 0) { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - - } else { - - // reduce each warp - for (int i = 1; i < policy::hip::WARP_SIZE; i *= 2) { - int srcLane = threadId ^ i; - T rhs = shfl_sync(temp, srcLane); - // only add from threads that exist (don't double count own value) - if (srcLane < numThreads) { - Combiner{}(temp, rhs); - } - } - } - - // reduce per warp values - if (numThreads > policy::hip::WARP_SIZE) { - - static_assert(policy::hip::MAX_WARPS <= policy::hip::WARP_SIZE, - "Max Warps must be less than or equal to Warp Size for this algorithm to work"); - - __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray)]; - RAJA::detail::SoAArray* sd = - reinterpret_cast *>(tmpsd); - - // write per warp values to shared memory - if (warpId == 0) { - sd->set(warpNum, temp); - } - - __syncthreads(); - - if (warpNum == 0) { - - // read per warp values - if (warpId * policy::hip::WARP_SIZE < numThreads) { - temp = sd->get(warpId); - } else { - temp = identity; - } - - for (int i = 1; i < policy::hip::MAX_WARPS; i *= 2) { - T rhs = shfl_xor_sync(temp, i); - Combiner{}(temp, rhs); - } - } - - __syncthreads(); - } - - return temp; -} - - //! 
reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, T identity, - TempIterator device_mem, + TempIterator in_device_mem, unsigned int* device_count) { + typename TempIterator::template rebind_accessor device_mem(in_device_mem); + int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z; int numThreads = blockDim.x * blockDim.y * blockDim.z; @@ -388,7 +160,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, if (threadId == 0) { device_mem.set(blockSlot, temp); // ensure write visible to all threadblocks - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots-1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1)); isLastBlock = (old_count == (numSlots-1)); @@ -400,6 +172,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, // last block accumulates values from device_mem if (isLastBlock) { temp = identity; + Accessor::fence_acquire(); for (unsigned int i = threadId; i < numSlots; @@ -526,6 +299,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // last block accumulates values from device_mem if (lastBlock) { temp = OP::identity(); + __threadfence(); for (int i = threadId; i < numBlocks; i += numThreads) { temp = OP{}(temp, red.device_mem.get(i)); @@ -545,7 +319,9 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red //! reduce values in grid into thread 0 of last running block // returns true if put reduced value in val -template +template RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, T identity, T* device_mem, @@ -576,8 +352,8 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, if (threadId == 0) { unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u); if (old_val == 0u) { - device_mem[atomicOffset] = identity; // consider making this atomic - __threadfence(); + Accessor::set(device_mem, atomicOffset, identity); + Accessor::fence_release(); ::atomicAdd(&device_count[atomicOffset], 1u); } } @@ -590,34 +366,59 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, // wait for device_mem to be initialized while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u) ; - __threadfence(); + Accessor::fence_acquire(); RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); - __threadfence(); + Accessor::fence_release(); // increment counter, (wraps back to zero if old count == (numSlots+1)) unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1)); isLastBlock = (old_count == (numSlots+1)); // the last block for each replication gets the value from device_mem if (isLastBlock) { - val = device_mem[atomicOffset]; // consider making this atomic + Accessor::fence_acquire(); + val = Accessor::get(device_mem, atomicOffset); } } return isLastBlock ? replicationId : replication; } +//! 
reduce values in block into thread 0 and atomically combines into device_mem +template +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, + T identity, + T* device_mem) +{ + int threadId = threadIdx.x + blockDim.x * threadIdx.y + + (blockDim.x * blockDim.y) * threadIdx.z; + + int blockId = blockIdx.x + gridDim.x * blockIdx.y + + (gridDim.x * gridDim.y) * blockIdx.z; + + int replicationId = (blockId%replication); + int atomicOffset = replicationId*atomic_stride; + + T temp = block_reduce(val, identity); + + // one thread per block performs an atomic on device_mem + if (threadId == 0 && temp != identity) { + RAJA::reduce::hip::atomic{}(device_mem[atomicOffset], temp); + } + +} + } // namespace impl //! Object that manages pinned memory buffers for reduction results // use one per reducer object -template +template class PinnedTally { public: //! Object put in Pinned memory with value and pointer to next Node struct Node { Node* next; - T values[replication]; + T values[num_slots]; }; //! Object per resource to keep track of pinned memory nodes struct ResourceNode { @@ -692,7 +493,7 @@ class PinnedTally return ret; } - auto operator*() -> T(&)[replication] { return m_n->values; } + auto operator*() -> T(&)[num_slots] { return m_n->values; } bool operator==(const ResourceNodeIterator& rhs) const { @@ -729,7 +530,7 @@ class PinnedTally ResourceNodeIterator end() { return {nullptr, nullptr}; } //! get new value for use in resource - auto new_value(::RAJA::resources::Hip res) -> T(&)[replication] + auto new_value(::RAJA::resources::Hip res) -> T(&)[num_slots] { #if defined(RAJA_ENABLE_OPENMP) lock_guard lock(m_mutex); @@ -746,7 +547,7 @@ class PinnedTally rn->node_list = nullptr; resource_list = rn; } - Node* n = hip::pinned_mempool_type::getInstance().template malloc(1); + Node* n = mempool::getInstance().template malloc(1); n->next = rn->node_list; rn->node_list = n; return n->values; @@ -769,7 +570,7 @@ class PinnedTally while (rn->node_list) { Node* n = rn->node_list; rn->node_list = n->next; - hip::pinned_mempool_type::getInstance().free(n); + mempool::getInstance().free(n); } resource_list = rn->next; free(rn); @@ -796,14 +597,20 @@ class PinnedTally //! Reduction data for Hip Offload -- stores value, host pointer, and device //! pointer -template -struct Reduce_Data { +struct Reduce_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; unsigned int* device_count; - RAJA::detail::SoAPtr device; + RAJA::detail::SoAPtr device; bool own_device_ptr; Reduce_Data() : Reduce_Data(T(), T()){}; @@ -836,9 +643,9 @@ struct Reduce_Data { //! 
initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -849,9 +656,9 @@ struct Reduce_Data { void grid_reduce(T* output) { T temp = value; - - size_t replicationId = impl::grid_reduce( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -867,7 +674,7 @@ struct Reduce_Data { size_t numBlocks = gridDim.x * gridDim.y * gridDim.z; size_t maxNumSlots = (numBlocks + replication - 1) / replication; device.allocate(maxNumSlots*replication); - device_count = device_zeroed_mempool_type::getInstance() + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; } @@ -881,7 +688,7 @@ struct Reduce_Data { bool act = own_device_ptr; if (act) { device.deallocate(); - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -893,7 +700,93 @@ struct Reduce_Data { //! Reduction data for Hip Offload -- stores value, host pointer template -struct ReduceAtomic_Data { +struct ReduceAtomicInitialized_Data +{ + using tally_mempool_type = device_pinned_mempool_type; + + static constexpr size_t tally_slots = replication * atomic_stride; + + mutable T value; + T identity; + bool is_setup; + bool own_device_ptr; + + ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){} + + ReduceAtomicInitialized_Data(T initValue, T identity_) + : value{initValue}, + identity{identity_}, + is_setup{false}, + own_device_ptr{false} + { + } + + RAJA_HOST_DEVICE + ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + : value{other.identity}, + identity{other.identity}, + is_setup{other.is_setup}, + own_device_ptr{false} + { + } + + ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + + //! initialize output to identity to ensure never read + // uninitialized memory + T* init_grid_vals(T(&output)[tally_slots]) + { + for (size_t r = 0; r < tally_slots; ++r) { + output[r] = identity; + } + return &output[0]; + } + + //! reduce values in grid to single value, store in output + RAJA_DEVICE + void grid_reduce(T* output) + { + T temp = value; + + impl::grid_reduce_atomic_initialized( + temp, identity, output); + } + + //! check and setup for device + // allocate device pointers and get a new result buffer from the pinned tally + bool setupForDevice() + { + bool act = !is_setup && setupReducers(); + if (act) { + is_setup = true; + own_device_ptr = true; + } + return act; + } + + //! if own resources teardown device setup + // free device pointers + bool teardownForDevice() + { + bool act = own_device_ptr; + if (act) { + is_setup = false; + own_device_ptr = false; + } + return act; + } +}; + +//! 
Reduction data for Hip Offload -- stores value, host pointer +template +struct ReduceAtomic_Data +{ + using tally_mempool_type = pinned_mempool_type; + using data_mempool_type = device_mempool_type; + using count_mempool_type = device_zeroed_mempool_type; + + static constexpr size_t tally_slots = replication; mutable T value; T identity; @@ -926,9 +819,9 @@ struct ReduceAtomic_Data { //! initialize output to identity to ensure never read // uninitialized memory - T* init_grid_vals(T(&output)[replication]) + T* init_grid_vals(T(&output)[tally_slots]) { - for (size_t r = 0; r < replication; ++r) { + for (size_t r = 0; r < tally_slots; ++r) { output[r] = identity; } return &output[0]; @@ -940,8 +833,9 @@ struct ReduceAtomic_Data { { T temp = value; - size_t replicationId = impl::grid_reduce_atomic( - temp, identity, device, device_count); + size_t replicationId = impl::grid_reduce_atomic< + Combiner, Accessor, replication, atomic_stride>( + temp, identity, device, device_count); if (replicationId != replication) { output[replicationId] = temp; } @@ -953,8 +847,8 @@ struct ReduceAtomic_Data { { bool act = !device && setupReducers(); if (act) { - device = device_mempool_type::getInstance().template malloc(replication*atomic_stride); - device_count = device_zeroed_mempool_type::getInstance() + device = data_mempool_type::getInstance().template malloc(replication*atomic_stride); + device_count = count_mempool_type::getInstance() .template malloc(replication*atomic_stride); own_device_ptr = true; } @@ -967,9 +861,9 @@ struct ReduceAtomic_Data { { bool act = own_device_ptr; if (act) { - device_mempool_type::getInstance().free(device); + data_mempool_type::getInstance().free(device); device = nullptr; - device_zeroed_mempool_type::getInstance().free(device_count); + count_mempool_type::getInstance().free(device_count); device_count = nullptr; own_device_ptr = false; } @@ -977,11 +871,47 @@ struct ReduceAtomic_Data { } }; + //! Hip Reduction entity -- generalize on reduction, and type -template +template class Reduce { + static constexpr size_t replication = (tuning::replication > 0) + ? tuning::replication + : 32; + static constexpr size_t atomic_stride = (tuning::atomic_stride > 0) + ? tuning::atomic_stride + : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) + ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) + : 1); + + static constexpr bool use_atomic = tuning::maybe_atomic && + RAJA::reduce::hip::hip_atomic_available::value; + + using Accessor = std::conditional_t; + + //! hip reduction data storage class and folding algorithm + using reduce_data_type = std::conditional_t, + hip::ReduceAtomic_Data>, + hip::Reduce_Data>; + + static constexpr size_t tally_slots = reduce_data_type::tally_slots; + + using TallyType = PinnedTally; + + //! 
union to hold either pointer to PinnedTally or pointer to value + // only use list before setup for device and only use val_ptr after + union tally_u { + TallyType* list; + T* val_ptr; + constexpr tally_u(TallyType* l) : list(l){}; + constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; + }; + public: Reduce() : Reduce(T(), Combiner::identity()) {} @@ -989,7 +919,7 @@ class Reduce // the original object's parent is itself explicit Reduce(T init_val, T identity_ = Combiner::identity()) : parent{this}, - tally_or_val_ptr{new PinnedTally}, + tally_or_val_ptr{new TallyType}, val(init_val, identity_) { } @@ -1062,8 +992,8 @@ class Reduce if (n != end) { tally_or_val_ptr.list->synchronize_resources(); for (; n != end; ++n) { - T(&values)[replication] = *n; - for (size_t r = 0; r < replication; ++r) { + T(&values)[tally_slots] = *n; + for (size_t r = 0; r < tally_slots; ++r) { Combiner{}(val.value, values[r]); } } @@ -1087,47 +1017,20 @@ class Reduce private: const Reduce* parent; - - static constexpr size_t replication = (t_replication > 0) - ? t_replication - : 32; - static constexpr size_t atomic_stride = (t_atomic_stride > 0) - ? t_atomic_stride - : ((policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T)) - ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) - : 1); - - //! union to hold either pointer to PinnedTally or poiter to value - // only use list before setup for device and only use val_ptr after - union tally_u { - PinnedTally* list; - T* val_ptr; - constexpr tally_u(PinnedTally* l) : list(l){}; - constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){}; - }; - tally_u tally_or_val_ptr; - - //! hip reduction data storage class and folding algorithm - using reduce_data_type = typename std::conditional< - maybe_atomic && RAJA::reduce::hip::hip_atomic_available::value, - hip::ReduceAtomic_Data, - hip::Reduce_Data>::type; - - //! storage for reduction data reduce_data_type val; }; } // end namespace hip //! specialization of ReduceSum for hip_reduce -template -class ReduceSum, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceSum, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator+= for ReduceSum -- alias for combine() RAJA_HOST_DEVICE @@ -1139,13 +1042,13 @@ class ReduceSum, T> }; //! specialization of ReduceBitOr for hip_reduce -template -class ReduceBitOr, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitOr, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator|= for ReduceBitOr -- alias for combine() RAJA_HOST_DEVICE @@ -1157,13 +1060,13 @@ class ReduceBitOr, T> }; //! specialization of ReduceBitAnd for hip_reduce -template -class ReduceBitAnd, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceBitAnd, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable operator&= for ReduceBitAnd -- alias for combine() RAJA_HOST_DEVICE @@ -1175,13 +1078,13 @@ class ReduceBitAnd, T> }; //! 
specialization of ReduceMin for hip_reduce -template -class ReduceMin, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMin, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable min() for ReduceMin -- alias for combine() RAJA_HOST_DEVICE @@ -1193,13 +1096,13 @@ class ReduceMin, T> }; //! specialization of ReduceMax for hip_reduce -template -class ReduceMax, T> - : public hip::Reduce, T, maybe_atomic, replication, atomic_stride> +template +class ReduceMax, T> + : public hip::Reduce, T, tuning> { public: - using Base = hip::Reduce, T, maybe_atomic, replication, atomic_stride>; + using Base = hip::Reduce, T, tuning>; using Base::Base; //! enable max() for ReduceMax -- alias for combine() RAJA_HOST_DEVICE @@ -1211,18 +1114,18 @@ class ReduceMax, T> }; //! specialization of ReduceMinLoc for hip_reduce -template -class ReduceMinLoc, T, IndexType> +template +class ReduceMinLoc, T, IndexType> : public hip::Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::min; using NonLocCombiner = RAJA::reduce::min; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer @@ -1261,18 +1164,18 @@ class ReduceMinLoc, T, }; //! specialization of ReduceMaxLoc for hip_reduce -template -class ReduceMaxLoc, T, IndexType> +template +class ReduceMaxLoc, T, IndexType> : public hip:: Reduce>, RAJA::reduce::detail::ValueLoc, - maybe_atomic, replication, atomic_stride> + tuning> { public: using value_type = RAJA::reduce::detail::ValueLoc; using Combiner = RAJA::reduce::max; using NonLocCombiner = RAJA::reduce::max; - using Base = hip::Reduce; + using Base = hip::Reduce; using Base::Base; //! constructor requires a default value for the reducer diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index fc83f8999b..dc3caf86ef 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -56,6 +56,8 @@ #define RAJA_HOST __host__ #define RAJA_SUPPRESS_HD_WARN +#define RAJA_USE_HIP_INTRINSICS + #else #define RAJA_HOST_DEVICE diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index f19d9947b6..8441f75522 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -887,6 +887,75 @@ struct DefaultAccessor } }; + +/*! + * \brief Abstracts T into an equal or greater size array of integers whose + * size is between min_integer_type_size and max_interger_type_size inclusive. 
+ */ +template +struct AsIntegerArray +{ + static_assert(min_integer_type_size <= max_integer_type_size, + "incompatible min and max integer type size"); + using integer_type = typename std::conditional< + ((alignof(T) >= alignof(unsigned long long) && + sizeof(unsigned long long) <= max_integer_type_size) || + sizeof(unsigned long) < min_integer_type_size), + unsigned long long, + typename std::conditional< + ((alignof(T) >= alignof(unsigned long) && + sizeof(unsigned long) <= max_integer_type_size) || + sizeof(unsigned int) < min_integer_type_size), + unsigned long, + typename std::conditional< + ((alignof(T) >= alignof(unsigned int) && + sizeof(unsigned int) <= max_integer_type_size) || + sizeof(unsigned short) < min_integer_type_size), + unsigned int, + typename std::conditional< + ((alignof(T) >= alignof(unsigned short) && + sizeof(unsigned short) <= max_integer_type_size) || + sizeof(unsigned char) < min_integer_type_size), + unsigned short, + typename std::conditional< + ((alignof(T) >= alignof(unsigned char) && + sizeof(unsigned char) <= max_integer_type_size)), + unsigned char, + void>::type>::type>::type>::type>::type; + static_assert(!std::is_same::value, + "could not find a compatible integer type"); + static_assert(sizeof(integer_type) >= min_integer_type_size, + "integer_type smaller than min integer type size"); + static_assert(sizeof(integer_type) <= max_integer_type_size, + "integer_type greater than max integer type size"); + + static constexpr size_t num_integer_type = + (sizeof(T) + sizeof(integer_type) - 1) / sizeof(integer_type); + + integer_type array[num_integer_type] = {0}; + + AsIntegerArray() = default; + + RAJA_HOST_DEVICE constexpr size_t array_size() const + { + return num_integer_type; + } + + RAJA_HOST_DEVICE constexpr T get_value() const + { + T value; + memcpy(&value, &array[0], sizeof(T)); + return value; + } + + RAJA_HOST_DEVICE constexpr void set_value(T value) + { + memcpy(&array[0], &value, sizeof(T)); + } +}; + } // namespace detail } // namespace RAJA diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index d8d5fc670b..f6a5306c84 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -34,11 +34,19 @@ using OpenMPTargetReducePols = #endif #if defined(RAJA_ENABLE_CUDA) -using CudaReducePols = camp::list< RAJA::cuda_reduce >; +using CudaReducePols = camp::list< RAJA::cuda_reduce_with_fences, + RAJA::cuda_reduce_avoid_fences, + RAJA::cuda_reduce_atomic_with_fences, + RAJA::cuda_reduce_atomic_avoid_fences, + RAJA::cuda_reduce_atomic_host_init >; #endif #if defined(RAJA_ENABLE_HIP) -using HipReducePols = camp::list< RAJA::hip_reduce >; +using HipReducePols = camp::list< RAJA::hip_reduce_with_fences, + RAJA::hip_reduce_avoid_fences, + RAJA::hip_reduce_atomic_with_fences, + RAJA::hip_reduce_atomic_avoid_fences, + RAJA::hip_reduce_atomic_host_init >; #endif #if defined(RAJA_ENABLE_SYCL) From a6da3e896a487cc115c379db2adf578558d3324c Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:03:40 -0700 Subject: [PATCH 065/108] Add reduce atomic host with/avoid fences This allows the choice of which fallback non-atomic policy is used --- include/RAJA/policy/cuda/policy.hpp | 4 +++- include/RAJA/policy/hip/policy.hpp | 4 +++- test/include/RAJA_test-reducepol.hpp | 6 ++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index b3b8ae04d1..c7815ecfa4 100644 --- 
a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -288,7 +288,9 @@ using cuda_reduce_atomic_with_fences = cuda_reduce_base; -using cuda_reduce_atomic_host_init = cuda_reduce_base; +using cuda_reduce_atomic_host_with_fences = cuda_reduce_base; + +using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; using cuda_reduce = cuda_reduce_with_fences; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 6a53e91177..e89c4e16ad 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -280,7 +280,9 @@ using hip_reduce_atomic_with_fences = hip_reduce_base; -using hip_reduce_atomic_host_init = hip_reduce_base; +using hip_reduce_atomic_host_with_fences = hip_reduce_base; + +using hip_reduce_atomic_host_avoid_fences = hip_reduce_base; #if defined(RAJA_USE_HIP_INTRINSICS) using hip_reduce = hip_reduce_avoid_fences; diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index f6a5306c84..cd97a686ca 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -38,7 +38,8 @@ using CudaReducePols = camp::list< RAJA::cuda_reduce_with_fences, RAJA::cuda_reduce_avoid_fences, RAJA::cuda_reduce_atomic_with_fences, RAJA::cuda_reduce_atomic_avoid_fences, - RAJA::cuda_reduce_atomic_host_init >; + RAJA::cuda_reduce_atomic_host_with_fences, + RAJA::cuda_reduce_atomic_host_avoid_fences >; #endif #if defined(RAJA_ENABLE_HIP) @@ -46,7 +47,8 @@ using HipReducePols = camp::list< RAJA::hip_reduce_with_fences, RAJA::hip_reduce_avoid_fences, RAJA::hip_reduce_atomic_with_fences, RAJA::hip_reduce_atomic_avoid_fences, - RAJA::hip_reduce_atomic_host_init >; + RAJA::hip_reduce_atomic_host_with_fences, + RAJA::hip_reduce_atomic_host_avoid_fences >; #endif #if defined(RAJA_ENABLE_SYCL) From c418614864dde1b045e9042ecaf2ccaa5f02a8fa Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:04:08 -0700 Subject: [PATCH 066/108] change default cuda/hip reduction policies --- include/RAJA/policy/cuda/policy.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index c7815ecfa4..ce3b10b708 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -294,7 +294,7 @@ using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; -#if defined(RAJA_USE_HIP_INTRINSICS) using hip_reduce = hip_reduce_avoid_fences; -#else -using hip_reduce = hip_reduce_with_fences; -#endif -using hip_reduce_atomic = hip_reduce_atomic_host_init; +using hip_reduce_atomic = hip_reduce_atomic_avoid_fences; // Policy for RAJA::statement::Reduce that reduces threads in a block From 18904b6d40dcf92a654852668a78a609a53b8322 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:13:06 -0700 Subject: [PATCH 067/108] fixup atomic host policies --- include/RAJA/policy/cuda/policy.hpp | 3 ++- include/RAJA/policy/hip/policy.hpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ce3b10b708..5b1f3a00fb 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1177,7 +1177,8 @@ using policy::cuda::cuda_reduce_with_fences; using policy::cuda::cuda_reduce_avoid_fences; using policy::cuda::cuda_reduce_atomic_with_fences; using policy::cuda::cuda_reduce_atomic_avoid_fences; 
-using policy::cuda::cuda_reduce_atomic_host_init; +using policy::cuda::cuda_reduce_atomic_host_with_fences; +using policy::cuda::cuda_reduce_atomic_host_avoid_fences; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 379924e71c..c9c42f881a 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1095,7 +1095,8 @@ using policy::hip::hip_reduce_with_fences; using policy::hip::hip_reduce_avoid_fences; using policy::hip::hip_reduce_atomic_with_fences; using policy::hip::hip_reduce_atomic_avoid_fences; -using policy::hip::hip_reduce_atomic_host_init; +using policy::hip::hip_reduce_atomic_host_with_fences; +using policy::hip::hip_reduce_atomic_host_avoid_fences; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; From 4ffd37a154b1e9243a41e8df49f4e99fcb333ca2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Sun, 7 Apr 2024 21:25:32 -0700 Subject: [PATCH 068/108] Fix argument ordering --- include/RAJA/policy/cuda/policy.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 5b1f3a00fb..ea4c5ca1c7 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -278,7 +278,7 @@ template < bool maybe_atomic, bool avoid_fences = false > using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< replication, atomic_stride, - maybe_atomic, init_on_host, avoid_fences> >; + maybe_atomic, avoid_fences, init_on_host> >; using cuda_reduce_with_fences = cuda_reduce_base; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index c9c42f881a..7bcb01c039 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -270,7 +270,7 @@ template < bool maybe_atomic, bool avoid_fences = false > using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< replication, atomic_stride, - maybe_atomic, init_on_host, avoid_fences> >; + maybe_atomic, avoid_fences, init_on_host> >; using hip_reduce_with_fences = hip_reduce_base; From c0595f61b509b4d4acda82e4ec75bf6824e1d427 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 8 Apr 2024 08:02:20 -0700 Subject: [PATCH 069/108] Adjust default policies again --- include/RAJA/policy/cuda/policy.hpp | 2 +- include/RAJA/policy/hip/policy.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ea4c5ca1c7..7bd895a2bc 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -294,7 +294,7 @@ using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base Date: Wed, 17 Apr 2024 08:08:17 -0700 Subject: [PATCH 070/108] Add check for specific hip builtins --- include/RAJA/policy/hip/intrinsics.hpp | 9 +++++---- include/RAJA/util/macros.hpp | 7 +++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index 374a66323e..fe3ac0f35d 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -91,7 +91,7 @@ struct AccessorAvoidingFences auto ptr = const_cast(reinterpret_cast(in_ptr + idx)); for (size_t i = 0; i < u.array_size(); ++i) { -#if 
defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load) u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); #else u.array[i] = atomicAdd(&ptr[i], integer_type(0)); @@ -112,7 +112,7 @@ struct AccessorAvoidingFences auto ptr = reinterpret_cast(in_ptr + idx); for (size_t i = 0; i < u.array_size(); ++i) { -#if defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store) __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); #else atomicExch(&ptr[i], u.array[i]); @@ -122,7 +122,7 @@ struct AccessorAvoidingFences static RAJA_DEVICE RAJA_INLINE void fence_acquire() { -#if defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); #else __threadfence(); @@ -131,7 +131,8 @@ struct AccessorAvoidingFences static RAJA_DEVICE RAJA_INLINE void fence_release() { -#if defined(RAJA_USE_HIP_INTRINSICS) +#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \ + RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt) __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); // Wait until all vmem operations complete (s_waitcnt vmcnt(0)) __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8)); diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp index dc3caf86ef..55e90010d8 100644 --- a/include/RAJA/util/macros.hpp +++ b/include/RAJA/util/macros.hpp @@ -66,6 +66,13 @@ #define RAJA_SUPPRESS_HD_WARN #endif + +#if defined(__has_builtin) +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) __has_builtin(x) +#else +#define RAJA_INTERNAL_CLANG_HAS_BUILTIN(x) 0 +#endif + /*! ******************************************************************************* * \def RAJA_USED_ARG(x) From 93ea8878762a501eedc7b56f7d5b95d291641b10 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 18 Apr 2024 14:21:21 -0700 Subject: [PATCH 071/108] Add RAJA::binary_tree_reduce RAJA::binary_tree_reduceThis is a more accurate option when adding many floating point numbers that uses a binary reduction tree pattern. RAJA::accumulate is also added which adds numbers into a single counter. 
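For reference, a minimal host-side usage sketch of the two interfaces added in this patch (the vector, its contents, and the printf call are illustrative assumptions, not part of the patch itself):

    #include "RAJA/RAJA.hpp"
    #include <cstdio>
    #include <vector>

    int main()
    {
      std::vector<double> vals(10000, 0.1);   // hypothetical input data

      // Left-fold accumulation: O(N) operations, O(1) extra memory.
      double folded = RAJA::accumulate(RAJA::make_span(vals.data(), vals.size()));

      // Binary-tree reduction: O(N) operations, O(lg N) extra memory, with less
      // floating point round-off than the left fold when summing many values.
      double treed = RAJA::binary_tree_reduce(
          RAJA::make_span(vals.data(), vals.size()),
          0.0, RAJA::operators::plus<double>{});

      std::printf("accumulate = %.17g, binary_tree_reduce = %.17g\n", folded, treed);
      return 0;
    }

Both functions default the initial value to BinaryOp::identity() and the operator to RAJA::operators::plus, so the explicit arguments on the second call are shown only for illustration.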
--- include/RAJA/RAJA.hpp | 6 + include/RAJA/util/Operators.hpp | 21 +- include/RAJA/util/math.hpp | 75 ++++ include/RAJA/util/reduce.hpp | 400 ++++++++++++++++++ include/RAJA/util/sort.hpp | 21 +- test/unit/algorithm/CMakeLists.txt | 56 ++- .../test-algorithm-util-reduce.cpp.in | 36 ++ .../algorithm/test-algorithm-util-sort.cpp.in | 12 +- .../tests/test-algorithm-reduce-utils.hpp | 350 +++++++++++++++ .../tests/test-algorithm-util-reduce.hpp | 205 +++++++++ 10 files changed, 1135 insertions(+), 47 deletions(-) create mode 100644 include/RAJA/util/math.hpp create mode 100644 include/RAJA/util/reduce.hpp create mode 100644 test/unit/algorithm/test-algorithm-util-reduce.cpp.in create mode 100644 test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp create mode 100644 test/unit/algorithm/tests/test-algorithm-util-reduce.hpp diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 32522a1f0d..5478392ff1 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -33,6 +33,7 @@ #include "RAJA/util/camp_aliases.hpp" #include "RAJA/util/macros.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/math.hpp" #include "RAJA/util/plugins.hpp" #include "RAJA/util/Registry.hpp" #include "RAJA/util/for_each.hpp" @@ -156,6 +157,11 @@ // #include "RAJA/util/sort.hpp" +// +// reduce algorithms +// +#include "RAJA/util/reduce.hpp" + // // WorkPool, WorkGroup, WorkSite objects // diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp index d76b862c22..b4249e7182 100644 --- a/include/RAJA/util/Operators.hpp +++ b/include/RAJA/util/Operators.hpp @@ -42,9 +42,20 @@ namespace operators namespace detail { +// truly associative (does not include fp add/multiply) struct associative_tag { }; +// associative up to floating point rounding differences +struct fp_associative_tag : associative_tag { +}; + +// get associativity tag appropriate for the type +template < typename T > +using associative_or_fp_associative_tag = + std::conditional_t>::value, + fp_associative_tag, associative_tag>; + template struct binary_function { using first_argument_type = Arg1; @@ -327,7 +338,7 @@ static_assert(check(), template struct plus : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const { @@ -347,7 +358,7 @@ struct minus : public detail::binary_function { template struct multiplies : public detail::binary_function, - detail::associative_tag { + detail::associative_or_fp_associative_tag { RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs, const Arg2& rhs) const @@ -569,6 +580,12 @@ struct is_associative { std::is_base_of::value; }; +template +struct is_fp_associative { + static constexpr const bool value = + std::is_base_of::value; +}; + template struct safe_plus : public plus +#include + +namespace RAJA +{ + +/*! + \brief evaluate log base 2 of n + + For positive n calculate log base 2 of n, and round the result down to the + nearest integer. + For zero or negative n return 0 + +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE RAJA_INLINE +constexpr T log2(T n) noexcept +{ + T result = 0; + if (n > 0) { + while(n >>= 1) { + ++result; + } + } + return result; +} + +/*! 
+ \brief "round up" to the next greatest power of 2 + + For a integer n, + if n is non-negative, + if n is a power of 2, return n + if n is not a power of 2, return the next greater power of 2 + if n is negative, return 0 +*/ +template < typename T, + std::enable_if_t::value>* = nullptr > +RAJA_HOST_DEVICE +constexpr T next_pow2(T n) noexcept +{ + --n; + for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) { + n |= n >> s; + } + ++n; + return n; +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp new file mode 100644 index 0000000000..6d0c28f861 --- /dev/null +++ b/include/RAJA/util/reduce.hpp @@ -0,0 +1,400 @@ +/*! +****************************************************************************** +* +* \file +* +* \brief Header file providing RAJA sort templates. +* +****************************************************************************** +*/ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#ifndef RAJA_util_reduce_HPP +#define RAJA_util_reduce_HPP + +#include "RAJA/config.hpp" + +#include +#include +#include +#include + +#include "RAJA/pattern/detail/algorithm.hpp" + +#include "RAJA/util/macros.hpp" +#include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" +#include "RAJA/util/Operators.hpp" + +namespace RAJA +{ + +namespace detail +{ + +/*! + \brief Reduce class that does a reduction with a left fold. +*/ +template +struct LeftFoldReduce +{ + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit LeftFoldReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + , m_accumulated_value(std::move(init)) + { + + } + + LeftFoldReduce(LeftFoldReduce const&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce const&) = delete; + LeftFoldReduce(LeftFoldReduce &&) = delete; + LeftFoldReduce& operator=(LeftFoldReduce &&) = delete; + + ~LeftFoldReduce() = default; + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + m_accumulated_value = BinaryOp::identity(); + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + T accumulated_value = std::move(m_accumulated_value); + + clear(); + + return accumulated_value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + return m_accumulated_value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T val) + { + m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val)); + } + +private: + BinaryOp m_op; + T m_accumulated_value; +}; + +/*! + \brief Reduce class that does a reduction with a binary tree. 
+*/ +template +struct BinaryTreeReduce +{ + static_assert(std::is_unsigned::value, "SizeType must be unsigned"); + static_assert(t_num_levels <= CHAR_BIT*sizeof(SizeType), "SizeType must be large enough to act at a bitset for num_levels"); + + static constexpr SizeType num_levels = t_num_levels; + + RAJA_HOST_DEVICE RAJA_INLINE + constexpr explicit BinaryTreeReduce(T init = BinaryOp::identity(), + BinaryOp op = BinaryOp{}) noexcept + : m_op(std::move(op)) + { + combine(std::move(init)); + } + + BinaryTreeReduce(BinaryTreeReduce const&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce const&) = delete; + BinaryTreeReduce(BinaryTreeReduce &&) = delete; + BinaryTreeReduce& operator=(BinaryTreeReduce &&) = delete; + + RAJA_HOST_DEVICE RAJA_INLINE + ~BinaryTreeReduce() + { + clear(); + } + + + /*! + \brief reset the combined value of the reducer to the identity + */ + RAJA_HOST_DEVICE RAJA_INLINE + void clear() noexcept + { + // destroy all values on the tree stack and reset count to 0 + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + get_value(level)->~T(); + + m_count ^= mask; + + } + } + } + + /*! + \brief return the combined value and clear the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get_and_clear() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) { + + if (m_count & mask) { + + value = m_op(std::move(value), std::move(*get_value(level))); + get_value(level)->~T(); + + m_count ^= mask; + } + } + + return value; + } + + /*! + \brief return the combined value + */ + RAJA_HOST_DEVICE RAJA_INLINE + T get() + { + // accumulate all values + T value = BinaryOp::identity(); + + for (SizeType count = m_count, level = 0, mask = 1; count; ++level, mask <<= 1) { + + if (count & mask) { + + value = m_op(std::move(value), *get_value(level)); + + count ^= mask; + } + } + + return value; + } + + /*! + \brief combine a value into the reducer + */ + RAJA_HOST_DEVICE RAJA_INLINE + void combine(T value) + { + // accumulate values and store in the first unused level found + // clear values from used levels along the way + SizeType level = 0; + for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1) { + + value = m_op(std::move(*get_value(level)), std::move(value)); + get_value(level)->~T(); + + } + + new(get_storage(level)) T(std::move(value)); + + ++m_count; + } + +private: + BinaryOp m_op; + + // A counter of the number of inputs combined. + // The bits of count indicate which levels of tree stack have a value + SizeType m_count = 0; + + // Each level in tree stack has a value that holds the accumulation of 2^level + // values or is unused and has no value. + std::aligned_storage_t m_tree_stack[num_levels]; + + RAJA_HOST_DEVICE RAJA_INLINE + void* get_storage(SizeType level) + { + return &m_tree_stack[level]; + } + + RAJA_HOST_DEVICE RAJA_INLINE + T* get_value(SizeType level) + { +#if __cplusplus >= 201703L && !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) + // TODO: check that launder is supported in device code + return std::launder(reinterpret_cast(&m_tree_stack[level])); +#else + return reinterpret_cast(&m_tree_stack[level]); +#endif + } +}; + + +template +using HighAccuracyReduce = std::conditional_t< + RAJA::operators::is_fp_associative::value, + BinaryTreeReduce, + LeftFoldReduce>; + + +/*! 
+ \brief Combine into a single value using a left fold with the given + operation using O(N) operations and O(1) memory +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T left_fold_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + LeftFoldReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +/*! + \brief reduce using a binary tree with the given operation + and using O(N) operations and O(lg(n)) memory + + This is more accurate than sequentially adding into a single value for + floating point types. +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T binary_tree_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + using std::distance; + using SizeType = std::make_unsigned_t; + BinaryTreeReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + + +/*! + \brief reducer that uses a high accuracy implementation when round-off error + is a concern, or a faster algorithm with it is not a concern +*/ +template +RAJA_HOST_DEVICE RAJA_INLINE +T high_accuracy_reduce(Iter begin, + Iter end, + T init, + BinaryOp op) +{ + HighAccuracyReduce reducer(std::move(init), std::move(op)); + + for (; begin != end; ++begin) { + + reducer.combine(*begin); + + } + + return reducer.get_and_clear(); +} + +} // namespace detail + +/*! + \brief Accumulate given range to a single value + using a left fold algorithm in O(N) operations and O(1) extra memory + see https://en.cppreference.com/w/cpp/algorithm/accumulate +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + accumulate(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::left_fold_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! + \brief Reduce given range to a single value + using a binary tree algorithm in O(N) operations and O(lg(N)) extra memory + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + binary_tree_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::binary_tree_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +/*! 
+ \brief Reduce given range to a single value + using an algorithm with high accuracy when floating point round off is a + concern + see https://en.cppreference.com/w/cpp/algorithm/reduce +*/ +template , + typename BinaryOp = operators::plus> +RAJA_HOST_DEVICE RAJA_INLINE +concepts::enable_if_t> + high_accuracy_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{}) +{ + using std::begin; + using std::end; + static_assert(type_traits::is_binary_function::value, + "BinaryOp must model BinaryFunction"); + + return detail::high_accuracy_reduce(begin(c), end(c), std::move(init), std::move(op)); +} + +} // namespace RAJA + +#endif diff --git a/include/RAJA/util/sort.hpp b/include/RAJA/util/sort.hpp index f1eebfc282..bbec03dfe1 100644 --- a/include/RAJA/util/sort.hpp +++ b/include/RAJA/util/sort.hpp @@ -26,8 +26,8 @@ #include "RAJA/pattern/detail/algorithm.hpp" #include "RAJA/util/macros.hpp" - #include "RAJA/util/concepts.hpp" +#include "RAJA/util/math.hpp" namespace RAJA { @@ -35,23 +35,6 @@ namespace RAJA namespace detail { -/*! - \brief evaluate log base 2 of N rounded down to the nearest integer >= 0 -*/ -RAJA_HOST_DEVICE RAJA_INLINE -unsigned -ulog2(size_t N) -{ - unsigned val = 0; - - while (N > 1) { - val += 1; - N >>= 1; - } - - return val; -} - /*! \brief unstable partition given range inplace using predicate function and using O(N) predicate evaluations and O(1) memory @@ -426,7 +409,7 @@ intro_sort(Iter begin, auto N = end - begin; // set max depth to 2*lg(N) - unsigned max_depth = 2*detail::ulog2(N); + unsigned max_depth = 2*RAJA::log2(N); #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // limit max_depth statically in device code to allow compiler to remove recursion diff --git a/test/unit/algorithm/CMakeLists.txt b/test/unit/algorithm/CMakeLists.txt index 0142a94ed3..ea93727d59 100644 --- a/test/unit/algorithm/CMakeLists.txt +++ b/test/unit/algorithm/CMakeLists.txt @@ -48,46 +48,62 @@ foreach( SORT_BACKEND ${SORT_BACKENDS} ) endforeach() -set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) -set( CUDA_UTIL_SORTS Shell Heap Intro ) -set( HIP_UTIL_SORTS Shell Heap Intro ) -macro(RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS SORT_BACKEND_in SORT_SIZE_in UTIL_SORTS) - set( SORT_BACKEND ${SORT_BACKEND_in} ) - set( SORT_SIZE ${SORT_SIZE_in} ) - foreach( UTIL_SORT ${UTIL_SORTS} ) - configure_file( test-algorithm-util-sort.cpp.in - test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) +macro(RAJA_GENERATE_ALGORITHM_UTIL_TESTS ALG ALG_BACKEND_in ALG_SIZE_in UTIL_ALGS) + set( ALG_BACKEND ${ALG_BACKEND_in} ) + set( ALG_SIZE ${ALG_SIZE_in} ) + foreach( UTIL_ALG ${UTIL_ALGS} ) + configure_file( test-algorithm-util-${ALG}.cpp.in + test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - raja_add_test( NAME test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND} - SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.cpp ) + raja_add_test( NAME test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND} + SOURCES ${CMAKE_CURRENT_BINARY_DIR}/test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.cpp ) - target_include_directories(test-algorithm-util-sort-${UTIL_SORT}-${SORT_BACKEND}.exe + target_include_directories(test-algorithm-util-${ALG}-${UTIL_ALG}-${ALG_BACKEND}.exe PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/tests) endforeach() - unset( SORT_SIZE ) - unset( SORT_BACKEND ) + unset( ALG_SIZE ) + unset( ALG_BACKEND ) endmacro() -RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) 
-RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Sequential Small "Insertion" ) +set( SEQUENTIAL_UTIL_SORTS Shell Heap Intro Merge ) +set( CUDA_UTIL_SORTS Shell Heap Intro ) +set( HIP_UTIL_SORTS Shell Heap Intro ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Default "${SEQUENTIAL_UTIL_SORTS}" ) +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Sequential Small "Insertion" ) if(RAJA_ENABLE_CUDA) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Small "${CUDA_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Cuda Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Small "${CUDA_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Cuda Tiny "Insertion" ) endif() if(RAJA_ENABLE_HIP) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Small "${HIP_UTIL_SORTS}" ) - RAJA_GENERATE_ALGORITHM_UTIL_SORT_TESTS( Hip Tiny "Insertion" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Small "${HIP_UTIL_SORTS}" ) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( sort Hip Tiny "Insertion" ) endif() + +set( UTIL_REDUCES BinaryTree Accumulate ) + +RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Sequential Default "${UTIL_REDUCES}" ) + +if(RAJA_ENABLE_CUDA) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Cuda Small "${UTIL_REDUCES}" ) +endif() + +if(RAJA_ENABLE_HIP) + RAJA_GENERATE_ALGORITHM_UTIL_TESTS( reduce Hip Small "${UTIL_REDUCES}" ) +endif() + + unset( SORT_BACKENDS ) unset( SEQUENTIAL_UTIL_SORTS ) unset( CUDA_UTIL_SORTS ) unset( HIP_UTIL_SORTS ) +unset( UTIL_REDUCES ) raja_add_test( diff --git a/test/unit/algorithm/test-algorithm-util-reduce.cpp.in b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in new file mode 100644 index 0000000000..d7dd20bcd2 --- /dev/null +++ b/test/unit/algorithm/test-algorithm-util-reduce.cpp.in @@ -0,0 +1,36 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +// +// test/include headers +// +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" + +// +// Header for tests in ./tests directory +// +// Note: CMake adds ./tests as an include dir for these tests. 
+// +#include "test-algorithm-util-reduce.hpp" + + +// +// Cartesian product of types used in parameterized tests +// +using @ALG_BACKEND@@UTIL_ALG@ReduceTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@ReduceReducers, + @ALG_BACKEND@ResourceList, + ReduceValTypeList, + ReduceMaxNList@ALG_SIZE@ > >::Types; + +// +// Instantiate parameterized test +// +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, + ReduceUnitTest, + @ALG_BACKEND@@UTIL_ALG@ReduceTypes ); diff --git a/test/unit/algorithm/test-algorithm-util-sort.cpp.in b/test/unit/algorithm/test-algorithm-util-sort.cpp.in index 7dbb0dcd93..0555a9e9f0 100644 --- a/test/unit/algorithm/test-algorithm-util-sort.cpp.in +++ b/test/unit/algorithm/test-algorithm-util-sort.cpp.in @@ -22,15 +22,15 @@ // // Cartesian product of types used in parameterized tests // -using @SORT_BACKEND@@UTIL_SORT@SortTypes = - Test< camp::cartesian_product<@SORT_BACKEND@@UTIL_SORT@SortSorters, - @SORT_BACKEND@ResourceList, +using @ALG_BACKEND@@UTIL_ALG@SortTypes = + Test< camp::cartesian_product<@ALG_BACKEND@@UTIL_ALG@SortSorters, + @ALG_BACKEND@ResourceList, SortKeyTypeList, - SortMaxNList@SORT_SIZE@ > >::Types; + SortMaxNList@ALG_SIZE@ > >::Types; // // Instantiate parameterized test // -INSTANTIATE_TYPED_TEST_SUITE_P( @SORT_BACKEND@, +INSTANTIATE_TYPED_TEST_SUITE_P( @ALG_BACKEND@, SortUnitTest, - @SORT_BACKEND@@UTIL_SORT@SortTypes ); + @ALG_BACKEND@@UTIL_ALG@SortTypes ); diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp new file mode 100644 index 0000000000..5277a07684 --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp @@ -0,0 +1,350 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. 
+// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing test infrastructure for reduce tests +/// + +#ifndef __TEST_ALGORITHM_REDUCE_UTILS_HPP__ +#define __TEST_ALGORITHM_REDUCE_UTILS_HPP__ + +#include "RAJA_test-base.hpp" +#include "RAJA_test-camp.hpp" +#include "RAJA_test-forall-data.hpp" +#include "type_helper.hpp" +#include "RAJA_unit-test-forone.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + + +// tag classes to differentiate reduce by attributes and apply correct testing +struct left_fold_reduce_tag { }; +struct unordered_reduce_tag { }; + +struct reduce_interface_tag { }; + +struct reduce_default_interface_tag { }; +struct reduce_init_interface_tag { }; +struct reduce_init_op_interface_tag { }; + + +// synchronize based on a RAJA execution policy +template < typename policy > +struct PolicySynchronize +{ + void synchronize() + { + // no synchronization needed + } +}; + +#if defined(RAJA_ENABLE_CUDA) +// partial specialization for cuda_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + +#if defined(RAJA_ENABLE_HIP) +// partial specialization for hip_exec +template < size_t BLOCK_SIZE, bool Async > +struct PolicySynchronize> +{ + void synchronize() + { + if (Async) { RAJA::synchronize(); } + } +}; +#endif + + +template +struct ReduceData; + +template +struct ReduceData +{ + ValType* values = nullptr; + ValType* reduced_value = nullptr; + Res m_res; + + template < typename RandomGenerator > + ReduceData(size_t N, Res res, RandomGenerator gen_random) + : m_res(res) + { + if (N > 0) { + values = m_res.template allocate(N, camp::resources::MemoryAccess::Managed); + } + reduced_value = m_res.template allocate(1, camp::resources::MemoryAccess::Managed); + + for (size_t i = 0; i < N; i++) { + values[i] = gen_random(); + } + } + + void copy_data(size_t N) + { + if ( N == 0 ) return; + } + + Res resource() + { + return m_res; + } + + ReduceData(ReduceData const&) = delete; + ReduceData& operator=(ReduceData const&) = delete; + + ~ReduceData() + { + if (values != nullptr) { + m_res.deallocate(values, camp::resources::MemoryAccess::Managed); + m_res.deallocate(reduced_value, camp::resources::MemoryAccess::Managed); + } + } +}; + + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_default_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N)); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp, + Reducer reducer, reduce_interface_tag, reduce_init_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init); + reducer.synchronize(); +} + +template +void doReduce(ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + Reducer reducer, reduce_interface_tag, reduce_init_op_interface_tag) +{ + data.copy_data(N); + data.resource().wait(); + reducer(data.reduced_value, RAJA::make_span(data.values, N), init, op); + reducer.synchronize(); +} + + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, left_fold_reduce_tag, reduce_interface_tag si, 
BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (left fold reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + +template +::testing::AssertionResult testReduce( + const char* test_name, + const unsigned seed, + ReduceData & data, + RAJA::Index_type N, + T init, + BinaryOp op, + TestReducer test_reducer, unordered_reduce_tag, reduce_interface_tag si, BinaryOpInterface ci) +{ + doReduce(data, N, init, op, test_reducer, si, ci); + + T reduced_check_value = init; + for (RAJA::Index_type i = 0; i < N; i++) { + reduced_check_value = op(std::move(reduced_check_value), data.values[i]); + } + + if (reduced_check_value != *data.reduced_value) { + return ::testing::AssertionFailure() + << test_reducer.name() << " (unordered reduce) " << test_name + << " (with N " << N << " with seed " << seed << ")" + << " incorrect " << *data.reduced_value + << ", expected " << reduced_check_value; + } + + return ::testing::AssertionSuccess(); +} + + +template +void testReducerInterfaces(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + using reduce_category = typename Reducer::reduce_category ; + using interface_category = typename Reducer::reduce_interface ; + using no_init_operator = reduce_default_interface_tag; + using init_no_operator = reduce_init_interface_tag; + using init_operator = reduce_init_op_interface_tag; + + std::mt19937 rng(seed); + RAJA::Index_type N = std::uniform_int_distribution((MaxN+1)/2, MaxN)(rng); + std::uniform_int_distribution dist(-N, N); + + ReduceData data(N, res, [&](){ return dist(rng); }); + + ASSERT_TRUE(testReduce("default", seed, data, N, RAJA::operators::plus::identity(), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, no_init_operator{})); + ASSERT_TRUE(testReduce("init", seed, data, N, ValType(N), RAJA::operators::plus{}, + reducer, reduce_category{}, interface_category{}, init_no_operator{})); + ASSERT_TRUE(testReduce("minimum", seed, data, N, ValType(0), RAJA::operators::minimum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); + ASSERT_TRUE(testReduce("Maximum", seed, data, N, ValType(0), RAJA::operators::maximum{}, + reducer, reduce_category{}, interface_category{}, init_operator{})); +} + +template +void testReducer(unsigned seed, RAJA::Index_type MaxN, Reducer reducer, Res res) +{ + testReducerInterfaces(seed, 0, reducer, res); + for (RAJA::Index_type n = 1; n <= MaxN; n *= 10) { + testReducerInterfaces(seed, n, reducer, res); + } +} + +inline unsigned get_random_seed() +{ + static unsigned seed = std::random_device{}(); + return seed; +} + + +TYPED_TEST_SUITE_P(ReduceUnitTest); + +template < typename T > +class ReduceUnitTest : public ::testing::Test +{ }; + +TYPED_TEST_P(ReduceUnitTest, UnitReduce) +{ + using Reducer = typename camp::at>::type; + using ResType = typename camp::at>::type; + using ValType = typename camp::at>::type; + using MaxNType = typename camp::at>::type; + + unsigned seed = get_random_seed(); + RAJA::Index_type MaxN = MaxNType::value; + Reducer reducer{}; + ResType res = ResType::get_default(); + + 
testReducer(seed, MaxN, reducer, res); +} + +REGISTER_TYPED_TEST_SUITE_P(ReduceUnitTest, UnitReduce); + + +// +// Key types for reduce tests +// +using ReduceValTypeList = + camp::list< + RAJA::Index_type, + int, +#if defined(RAJA_TEST_EXHAUSTIVE) + unsigned, + long long, + unsigned long long, + float, +#endif + double + >; + +// Max test lengths for reduce tests +using ReduceMaxNListDefault = + camp::list< + camp::num<10000> + >; + +using ReduceMaxNListSmall = + camp::list< + camp::num<1000> + >; + +using ReduceMaxNListTiny = + camp::list< + camp::num<100> + >; + +#endif //__TEST_ALGORITHM_REDUCE_UTILS_HPP__ + diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp new file mode 100644 index 0000000000..f2cb0dda8d --- /dev/null +++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp @@ -0,0 +1,205 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Header file containing Reducer classes for util reduce tests +/// + +#ifndef __TEST_ALGORITHM_UTIL_REDUCE_HPP__ +#define __TEST_ALGORITHM_UTIL_REDUCE_HPP__ + +#include "test-algorithm-reduce-utils.hpp" + + +template < typename test_policy > +using ForoneSynchronize = PolicySynchronize>; + + +template < typename test_policy, typename platform = test_platform > +struct BinaryTreeReduce; + +template < typename test_policy, typename platform = test_platform > +struct Accumulate; + + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::binary_tree_reduce"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... args) + { + *reduced_value = RAJA::binary_tree_reduce(std::forward(args)...); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + const char* name() + { + return "RAJA::accumulate"; + } + + template < typename T, typename... Args > + void operator()(T* reduced_value, Args&&... 
args) + { + *reduced_value = RAJA::accumulate(std::forward(args)...); + } +}; + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + +template < typename test_policy > +struct BinaryTreeReduce + : ForoneSynchronize +{ + using reduce_category = unordered_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + BinaryTreeReduce() + : m_name(std::string("RAJA::binary_tree_reduce<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::binary_tree_reduce(c, init, op); + }); + } +}; + +template < typename test_policy > +struct Accumulate + : ForoneSynchronize +{ + using reduce_category = left_fold_reduce_tag; + using reduce_interface = reduce_interface_tag; + + std::string m_name; + + Accumulate() + : m_name(std::string("RAJA::accumulate<") + test_policy_info::name() + std::string(">")) + { } + + const char* name() + { + return m_name.c_str(); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c); + }); + } + + template < typename T, typename Container > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init); + }); + } + + template < typename T, typename Container, typename BinaryOp > + void operator()(T* reduced_value, Container&& c, RAJA::detail::ContainerVal init, BinaryOp op) + { + forone( [=] RAJA_DEVICE() { + *reduced_value = RAJA::accumulate(c, init, op); + }); + } +}; + +#endif + + +using SequentialBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using SequentialAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#if defined(RAJA_ENABLE_CUDA) + +using CudaBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using CudaAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#if defined(RAJA_ENABLE_HIP) + +using HipBinaryTreeReduceReducers = + camp::list< + BinaryTreeReduce + >; + +using HipAccumulateReduceReducers = + camp::list< + Accumulate + >; + +#endif + +#endif //__TEST_ALGORITHM_UTIL_REDUCE_HPP__ + From b8cfadfaf3b22effb3cfda89c698d6d2be5912bc Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 18 Apr 2024 16:19:36 -0700 Subject: [PATCH 072/108] Use the higher accuracy reducer in cuda and hip reducers --- include/RAJA/policy/cuda/reduce.hpp | 6 +++++- include/RAJA/policy/hip/reduce.hpp | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index ccb310d2f9..698a259375 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -35,6 +35,7 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" 
+#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" @@ -995,12 +996,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { T(&values)[tally_slots] = *n; for (size_t r = 0; r < tally_slots; ++r) { - Combiner{}(val.value, values[r]); + reducer.combine(std::move(values[r])); } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 6579633957..9b8c625c22 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -35,6 +35,7 @@ #include "RAJA/util/basic_mempool.hpp" #include "RAJA/util/mutex.hpp" #include "RAJA/util/types.hpp" +#include "RAJA/util/reduce.hpp" #include "RAJA/pattern/detail/reduce.hpp" #include "RAJA/pattern/reduce.hpp" @@ -991,12 +992,15 @@ class Reduce auto end = tally_or_val_ptr.list->end(); if (n != end) { tally_or_val_ptr.list->synchronize_resources(); + ::RAJA::detail::HighAccuracyReduce + reducer(std::move(val.value)); for (; n != end; ++n) { T(&values)[tally_slots] = *n; for (size_t r = 0; r < tally_slots; ++r) { - Combiner{}(val.value, values[r]); + reducer.combine(std::move(values[r])); } } + val.value = reducer.get_and_clear(); tally_or_val_ptr.list->free_list(); } return val.value; From aae46e9fb3a22aa61a9dbe7d5d0ff6bfcf715bc9 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 10:31:48 -0700 Subject: [PATCH 073/108] Add some documentation for new cuda/hip reduction policies --- docs/sphinx/user_guide/feature/policies.rst | 56 +++++++++++++-------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index aad065cb16..9222af59c4 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -743,26 +743,42 @@ It is important to note the following constraints about RAJA reduction usage: The following table summarizes RAJA reduction policy types: -======================= ============= ========================================== -Reduction Policy Loop Policies Brief description - to Use With -======================= ============= ========================================== -seq_reduce seq_exec, Non-parallel (sequential) reduction. -omp_reduce any OpenMP OpenMP parallel reduction. - policy -omp_reduce_ordered any OpenMP OpenMP parallel reduction with result - policy guaranteed to be reproducible. -omp_target_reduce any OpenMP OpenMP parallel target offload reduction. - target policy -cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel - policy (device synchronization will occur when - reduction value is finalized). -cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use CUDA - policy atomic operations. -sycl_reduce any SYCL Reduction in a SYCL kernel (device - policy synchronization will occur when the - reduction value is finalized). 
-======================= ============= ========================================== +======================================== ============= ========================================== +Reduction Policy Loop Policies Brief description + to Use With +======================================== ============= ========================================== +seq_reduce seq_exec, Non-parallel (sequential) reduction. +omp_reduce any OpenMP OpenMP parallel reduction. + policy +omp_reduce_ordered any OpenMP OpenMP parallel reduction with result + policy guaranteed to be reproducible. +omp_target_reduce any OpenMP OpenMP parallel target offload reduction. + target policy +cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel + policy (device synchronization will occur when + reduction value is finalized). +cuda/hip_reduce\*atomic\* any CUDA/HIP Same as above, but reduction may use + policy atomic operations and initializes the + memory used for atomics on the device. + This works on all architectures but + incurs higher overheads. +cuda/hip_reduce\*atomic_host\* any CUDA/HIP Same as above, but reduction may use + policy atomic operations and initializes the + memory used for atomics on the host. + This works on recent architectures and + incurs lower overheads. +cuda/hip_reduce\*with_fences any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses with device scope fences. + This works on all architectures but + incurs higher overheads. +cuda/hip_reduce\*avoid_fences any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to allow it to avoid + device scope fences. This improves + performance on some architectures. +sycl_reduce any SYCL Reduction in a SYCL kernel (device + policy synchronization will occur when the + reduction value is finalized). +======================================== ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not guaranteed to generate correct results. So they should not be used From 0f2790e94edb3c6a287008241f716e6231bf1b47 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 10:57:01 -0700 Subject: [PATCH 074/108] improve code docs of reduce policies --- include/RAJA/policy/cuda/policy.hpp | 31 ++++++++++++++++++++++++----- include/RAJA/policy/hip/policy.hpp | 31 ++++++++++++++++++++++++----- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 7bd895a2bc..c86822763b 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -280,20 +280,41 @@ using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< replication, atomic_stride, maybe_atomic, avoid_fences, init_on_host> >; +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. 
On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *with_fences policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *avoid_fences policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. using cuda_reduce_with_fences = cuda_reduce_base; - +/// using cuda_reduce_avoid_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_with_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_avoid_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_host_with_fences = cuda_reduce_base; - +/// using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way using cuda_reduce = cuda_reduce_with_fences; +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way using cuda_reduce_atomic = cuda_reduce_atomic_host_with_fences; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index f59bbac891..b63a8690ad 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -272,20 +272,41 @@ using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< replication, atomic_stride, maybe_atomic, avoid_fences, init_on_host> >; +// Policies for RAJA::Reduce* objects with specific behaviors. +// - *atomic* policies may use atomics to combine partial results and falls back +// on a non-atomic policy when atomics can't be used with the given type. The +// use of atomics leads to order of operation differences which change the +// results of floating point sum reductions run to run. The memory used with +// atomics is initialized on the device which can be expensive on some HW. +// On some HW this is faster overall than the non-atomic policies. +// - *atomic_host* policies are similar to the atomic policies above. However +// the memory used with atomics is initialized on the host which is +// significantly cheaper on some HW. On some HW this is faster overall than +// the non-atomic and atomic policies. +// - *with_fences policies use normal memory accesses with device scope fences +// in the implementation. This works on all HW. +// - *avoid_fences policies use special (atomic) memory accesses that only cache +// in a cache shared by the whole device to avoid having to use +// device scope fences. This improves performance on some HW but +// is more difficult to code correctly. 
using hip_reduce_with_fences = hip_reduce_base; - +/// using hip_reduce_avoid_fences = hip_reduce_base; - +/// using hip_reduce_atomic_with_fences = hip_reduce_base; - +/// using hip_reduce_atomic_avoid_fences = hip_reduce_base; - +/// using hip_reduce_atomic_host_with_fences = hip_reduce_base; - +/// using hip_reduce_atomic_host_avoid_fences = hip_reduce_base; +// Policy for RAJA::Reduce* objects that gives the same answer every time when +// used in the same way using hip_reduce = hip_reduce_avoid_fences; +// Policy for RAJA::Reduce* objects that may use atomics and may not give the +// same answer every time when used in the same way using hip_reduce_atomic = hip_reduce_atomic_host_avoid_fences; From e9ef4c42007f21f5177f317ca14752b58642925e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 17:29:13 -0700 Subject: [PATCH 075/108] Use enums in cuda/hip reduce tuning policies --- include/RAJA/policy/cuda/policy.hpp | 71 +++++++++++++++++++++-------- include/RAJA/policy/cuda/reduce.hpp | 30 ++++++++---- include/RAJA/policy/hip/policy.hpp | 71 +++++++++++++++++++++-------- include/RAJA/policy/hip/reduce.hpp | 29 ++++++++---- 4 files changed, 145 insertions(+), 56 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index c86822763b..3b534348cf 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -159,15 +159,28 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; -template < size_t t_replication, size_t t_atomic_stride, - bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > + +enum struct reduce_algorithm : int +{ + finalize_last_block, + init_first_block_finalize_block_atomic, + init_host_finalize_block_atomic +}; + +enum struct block_communication_mode : int +{ + device_fence, + avoid_device_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > struct ReduceTuning { + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; - static constexpr bool maybe_atomic = t_maybe_atomic; - static constexpr bool avoid_fences = t_avoid_fences; - static constexpr bool init_on_host = t_init_on_host; }; } // namespace cuda @@ -271,14 +284,13 @@ struct cuda_atomic_explicit{}; */ using cuda_atomic = cuda_atomic_explicit; -template < bool maybe_atomic, + +template < RAJA::cuda::reduce_algorithm algorithm, + RAJA::cuda::block_communication_mode comm_mode, size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified, - bool init_on_host = false, - bool avoid_fences = false > -using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< - replication, atomic_stride, - maybe_atomic, avoid_fences, init_on_host> >; + size_t atomic_stride = named_usage::unspecified > +using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; // Policies for RAJA::Reduce* objects with specific behaviors. // - *atomic* policies may use atomics to combine partial results and falls back @@ -297,17 +309,35 @@ using cuda_reduce_base = cuda_reduce_policy< RAJA::cuda::ReduceTuning< // in a cache shared by the whole device to avoid having to use // device scope fences. 
This improves performance on some HW but // is more difficult to code correctly. -using cuda_reduce_with_fences = cuda_reduce_base; +using cuda_reduce_with_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::finalize_last_block, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_avoid_fences = cuda_reduce_base; +using cuda_reduce_avoid_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::finalize_last_block, + RAJA::cuda::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_with_fences = cuda_reduce_base; +using cuda_reduce_atomic_with_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_avoid_fences = cuda_reduce_base; +using cuda_reduce_atomic_avoid_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::cuda::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_with_fences = cuda_reduce_base; +using cuda_reduce_atomic_host_with_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::cuda::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_base; +using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::cuda::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way @@ -317,6 +347,11 @@ using cuda_reduce = cuda_reduce_with_fences; // same answer every time when used in the same way using cuda_reduce_atomic = cuda_reduce_atomic_host_with_fences; +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool maybe_atomic > +using cuda_reduce_base = std::conditional_t; + // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 698a259375..59ea86308f 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -877,6 +877,7 @@ struct ReduceAtomic_Data } }; + //! Cuda Reduction entity -- generalize on reduction, and type template class Reduce @@ -890,19 +891,28 @@ class Reduce ? RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - static constexpr bool use_atomic = tuning::maybe_atomic && - RAJA::reduce::cuda::cuda_atomic_available::value; - - using Accessor = std::conditional_t; + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorWithFences, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || + (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; //! 
cuda reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t, - cuda::ReduceAtomic_Data>, - cuda::Reduce_Data>; + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + (atomic_policy && !atomic_available), + cuda::Reduce_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), + cuda::ReduceAtomicInitialized_Data, + void>>, + void>>; static constexpr size_t tally_slots = reduce_data_type::tally_slots; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index b63a8690ad..1501a6dc35 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -154,15 +154,28 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer } }; -template < size_t t_replication, size_t t_atomic_stride, - bool t_maybe_atomic, bool t_avoid_fences, bool t_init_on_host > + +enum struct reduce_algorithm : int +{ + finalize_last_block, + init_first_block_finalize_block_atomic, + init_host_finalize_block_atomic +}; + +enum struct block_communication_mode : int +{ + device_fence, + avoid_device_fence +}; + +template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, + size_t t_replication, size_t t_atomic_stride > struct ReduceTuning { + static constexpr reduce_algorithm algorithm = t_algorithm; + static constexpr block_communication_mode comm_mode = t_comm_mode; static constexpr size_t replication = t_replication; static constexpr size_t atomic_stride = t_atomic_stride; - static constexpr bool maybe_atomic = t_maybe_atomic; - static constexpr bool avoid_fences = t_avoid_fences; - static constexpr bool init_on_host = t_init_on_host; }; } // namespace hip @@ -263,14 +276,13 @@ struct hip_atomic_explicit{}; */ using hip_atomic = hip_atomic_explicit; -template < bool maybe_atomic, + +template < RAJA::hip::reduce_algorithm algorithm, + RAJA::hip::block_communication_mode comm_mode, size_t replication = named_usage::unspecified, - size_t atomic_stride = named_usage::unspecified, - bool init_on_host = false, - bool avoid_fences = false > -using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< - replication, atomic_stride, - maybe_atomic, avoid_fences, init_on_host> >; + size_t atomic_stride = named_usage::unspecified > +using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< + algorithm, comm_mode, replication, atomic_stride> >; // Policies for RAJA::Reduce* objects with specific behaviors. // - *atomic* policies may use atomics to combine partial results and falls back @@ -289,17 +301,35 @@ using hip_reduce_base = hip_reduce_policy< RAJA::hip::ReduceTuning< // in a cache shared by the whole device to avoid having to use // device scope fences. This improves performance on some HW but // is more difficult to code correctly. 
-using hip_reduce_with_fences = hip_reduce_base; +using hip_reduce_with_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::finalize_last_block, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_avoid_fences = hip_reduce_base; +using hip_reduce_avoid_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::finalize_last_block, + RAJA::hip::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_with_fences = hip_reduce_base; +using hip_reduce_atomic_with_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_avoid_fences = hip_reduce_base; +using hip_reduce_atomic_avoid_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, + RAJA::hip::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_with_fences = hip_reduce_base; +using hip_reduce_atomic_host_with_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::hip::block_communication_mode::device_fence, + named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_avoid_fences = hip_reduce_base; +using hip_reduce_atomic_host_avoid_fences = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, + RAJA::hip::block_communication_mode::avoid_device_fence, + named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way @@ -309,6 +339,11 @@ using hip_reduce = hip_reduce_avoid_fences; // same answer every time when used in the same way using hip_reduce_atomic = hip_reduce_atomic_host_avoid_fences; +// Policy for RAJA::Reduce* objects that lets you select the default atomic or +// non-atomic policy with a bool +template < bool maybe_atomic > +using hip_reduce_base = std::conditional_t; + // Policy for RAJA::statement::Reduce that reduces threads in a block // down to threadIdx 0 diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 9b8c625c22..c8793d5102 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -886,19 +886,28 @@ class Reduce ? RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - static constexpr bool use_atomic = tuning::maybe_atomic && - RAJA::reduce::hip::hip_atomic_available::value; - - using Accessor = std::conditional_t; + std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), + impl::AccessorWithFences, + void>>; + + static constexpr bool atomic_policy = + (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || + (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; //! 
hip reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t, - hip::ReduceAtomic_Data>, - hip::Reduce_Data>; + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + (atomic_policy && !atomic_available), + hip::Reduce_Data, + std::conditional_t, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), + hip::ReduceAtomicInitialized_Data, + void>>, + void>>; static constexpr size_t tally_slots = reduce_data_type::tally_slots; From c946dd332a967c014884a0597d1c906e423b5319 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 20:35:58 -0700 Subject: [PATCH 076/108] update cuda/hip simple allocator comments --- include/RAJA/policy/cuda/MemUtils_CUDA.hpp | 8 ++++---- include/RAJA/policy/hip/MemUtils_HIP.hpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp index 5a66aff20e..43d927acab 100644 --- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp +++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp @@ -61,7 +61,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFreeHost(ptr)); @@ -80,7 +80,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -103,7 +103,7 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); @@ -127,7 +127,7 @@ struct DevicePinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { cudaErrchk(cudaFree(ptr)); diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp index 63a8c9911c..84c6d1fa38 100644 --- a/include/RAJA/policy/hip/MemUtils_HIP.hpp +++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp @@ -63,7 +63,7 @@ struct PinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipHostFree(ptr)); @@ -82,7 +82,7 @@ struct DeviceAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -105,7 +105,7 @@ struct DeviceZeroedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); @@ -124,7 +124,7 @@ struct DevicePinnedAllocator { return ptr; } - // returns true on success, false on failure + // returns true on success, throws a run time error exception on failure bool free(void* ptr) { hipErrchk(hipFree(ptr)); From 8fbbf48dc3a38466759c318ed5c396db052dc9e2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Fri, 19 Apr 2024 21:22:16 -0700 Subject: [PATCH 077/108] Rename Accessor types --- include/RAJA/policy/cuda/intrinsics.hpp | 35 ++++++++++++++++++------- include/RAJA/policy/cuda/reduce.hpp | 4 +-- 
include/RAJA/policy/hip/intrinsics.hpp | 35 ++++++++++++++++++------- include/RAJA/policy/hip/reduce.hpp | 4 +-- 4 files changed, 54 insertions(+), 24 deletions(-) diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp index 053d7ab50e..c908046cac 100644 --- a/include/RAJA/policy/cuda/intrinsics.hpp +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -46,9 +46,18 @@ namespace impl { /*! - * \brief Abstracts access to memory using normal memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. */ -struct AccessorWithFences : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -64,18 +73,24 @@ struct AccessorWithFences : RAJA::detail::DefaultAccessor /*! ****************************************************************************** * - * \brief Abstracts access to memory using atomic memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. * - * \Note Memory access through this class does not guarantee safe access to a - * value that is accessed concurrently by other threads as it may split - * memory operations into multiple atomic instructions. - * \Note Fences used through this class only guarantee ordering, they do not - * guarantee visiblity of non-atomic memory operations as it may not - * actually flush the cache. + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. + * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. + * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. 
* ****************************************************************************** */ -struct AccessorAvoidingFences +struct AccessorDeviceScopeUseSharedCache { // cuda has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 59ea86308f..37e266b94a 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -892,9 +892,9 @@ class Reduce : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorAvoidingFences, + impl::AccessorDeviceScopeUseSharedCache, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorWithFences, + impl::AccessorDeviceScopeUseLocalCache, void>>; static constexpr bool atomic_policy = diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index fe3ac0f35d..c36f609f90 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -46,9 +46,18 @@ namespace impl { /*! - * \brief Abstracts access to memory using normal memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. + * + * \Note This uses device scope fences to ensure ordering and to flush local + * caches so that memory accesses become visible to the whole device. + * \Note This class uses normal memory accesses that are cached in local caches + * so device scope fences are required to make memory accesses visible + * to the whole device. */ -struct AccessorWithFences : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -64,18 +73,24 @@ struct AccessorWithFences : RAJA::detail::DefaultAccessor /*! ****************************************************************************** * - * \brief Abstracts access to memory using atomic memory accesses. + * \brief Abstracts access to memory when coordinating between threads at + * device scope. The fences provided here are to be used with relaxed + * atomics in order to guarantee memory ordering and visibility of the + * accesses done through this class. * - * \Note Memory access through this class does not guarantee safe access to a - * value that is accessed concurrently by other threads as it may split - * memory operations into multiple atomic instructions. - * \Note Fences used through this class only guarantee ordering, they do not - * guarantee visiblity of non-atomic memory operations as it may not - * actually flush the cache. + * \Note This may use block scope fences to ensure ordering and avoid flushing + * local caches so special memory accesses are used to ensure visibility + * to the whole device. + * \Note This class uses device scope atomic memory accesses to bypass local + * caches so memory accesses are visible to the whole device without + * device scope fences. + * \Note A memory access may be split into multiple memory accesses, so + * even though atomic instructions are used concurrent accesses between + * different threads are not thread safe. 
* ****************************************************************************** */ -struct AccessorAvoidingFences +struct AccessorDeviceScopeUseSharedCache { // hip has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index c8793d5102..140f01eabf 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -887,9 +887,9 @@ class Reduce : 1); using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorAvoidingFences, + impl::AccessorDeviceScopeUseSharedCache, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorWithFences, + impl::AccessorDeviceScopeUseLocalCache, void>>; static constexpr bool atomic_policy = From 7f519cbab869e3902b591bcb23714698b92c8094 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 29 Apr 2024 09:44:58 -0700 Subject: [PATCH 078/108] Increase lassen ci time --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index e6da7cecbf..cee458cd60 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -56,7 +56,7 @@ variables: # Lassen and Butte use a different job scheduler (spectrum lsf) that does not # allow pre-allocation the same way slurm does. # Arguments for job level allocation - LASSEN_JOB_ALLOC: "1 -W 30 -q pci" + LASSEN_JOB_ALLOC: "1 -W 40 -q pci" # Project specific variants for lassen PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70" # Project specific deps for lassen From 2f9ce110a7b5ae141586139da2b11cca94bca2b2 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 29 Apr 2024 13:24:49 -0700 Subject: [PATCH 079/108] Don't specify T in atomic reduce helpers --- include/RAJA/policy/cuda/reduce.hpp | 10 +++++----- include/RAJA/policy/hip/reduce.hpp | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index 37e266b94a..ccdfe43e63 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -69,7 +69,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v); } }; @@ -77,7 +77,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v); } }; @@ -85,7 +85,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v); } }; @@ -93,7 +93,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v); } }; @@ -101,7 +101,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); + RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v); } }; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 140f01eabf..2258340b52 100644 --- a/include/RAJA/policy/hip/reduce.hpp 
+++ b/include/RAJA/policy/hip/reduce.hpp @@ -63,7 +63,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); + RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v); } }; @@ -71,7 +71,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMin(RAJA::hip_atomic{}, &val, v); } }; @@ -79,7 +79,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); + RAJA::atomicMax(RAJA::hip_atomic{}, &val, v); } }; @@ -87,7 +87,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); + RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v); } }; @@ -95,7 +95,7 @@ template struct atomic> { RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v) { - RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); + RAJA::atomicOr(RAJA::hip_atomic{}, &val, v); } }; From 780ecdc2f03a2d02e74a5fd8f21388fa542a64a1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Mon, 29 Apr 2024 16:46:23 -0700 Subject: [PATCH 080/108] Apply suggestions from code review Co-authored-by: Robert Chen --- include/RAJA/util/SoAPtr.hpp | 2 +- test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp | 2 +- test/unit/algorithm/tests/test-algorithm-util-reduce.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp index 00a2fce111..47802d8f0a 100644 --- a/include/RAJA/util/SoAPtr.hpp +++ b/include/RAJA/util/SoAPtr.hpp @@ -46,7 +46,7 @@ template - friend class SoAPtr; // fiend other instantiations of this class + friend class SoAPtr; // friend other instantiations of this class public: using value_type = T; diff --git a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp index 5277a07684..4e3f9fb795 100644 --- a/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp +++ b/test/unit/algorithm/tests/test-algorithm-reduce-utils.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. // // Produced at the Lawrence Livermore National Laboratory // diff --git a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp index f2cb0dda8d..062e0f9b91 100644 --- a/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp +++ b/test/unit/algorithm/tests/test-algorithm-util-reduce.hpp @@ -1,5 +1,5 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC. +// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC. 
// // Produced at the Lawrence Livermore National Laboratory // From 2bb0b8f5b17f3f7137679b4343256bc1cd5d395e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 30 Apr 2024 17:36:35 -0700 Subject: [PATCH 081/108] Rename rec_for_reduce polices to reduce_default --- include/RAJA/policy/cuda/policy.hpp | 18 +++++++++--------- include/RAJA/policy/hip/policy.hpp | 10 +++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index 3b534348cf..fb1a2f90ae 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1062,7 +1062,7 @@ using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConc using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer; -using CudaRecForReduceConcretizer = CudaMaxOccupancyConcretizer; +using CudaReduceDefaultConcretizer = CudaMaxOccupancyConcretizer; using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer; @@ -1189,24 +1189,24 @@ using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce_explicit = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, BLOCKS_PER_SM, Async>; + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_explicit_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, BLOCKS_PER_SM, true>; + CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_rec_for_reduce = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; + CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_rec_for_reduce_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_reduce_default_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, - CudaRecForReduceConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; + CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; // policies usable with WorkGroup diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 1501a6dc35..d1985ce667 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1058,7 +1058,7 @@ using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcre using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer; -using HipRecForReduceConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; +using HipReduceDefaultConcretizer = HipFractionOffsetOccupancyConcretizer, 0>; using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer; @@ -1125,14 +1125,14 @@ using hip_exec_occ_custom_async = policy::hip::hip_exec< Concretizer, true>; template -using hip_exec_rec_for_reduce = policy::hip::hip_exec< +using hip_exec_reduce_default = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipRecForReduceConcretizer, Async>; + HipReduceDefaultConcretizer, Async>; template -using hip_exec_rec_for_reduce_async = policy::hip::hip_exec< +using 
hip_exec_reduce_default_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, - HipRecForReduceConcretizer, true>; + HipReduceDefaultConcretizer, true>; // policies usable with WorkGroup using policy::hip::hip_work; From ebaeaf49f5eae03054222a553081252eaa1bef64 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 30 Apr 2024 17:40:37 -0700 Subject: [PATCH 082/108] Rename reducer policies --- include/RAJA/policy/cuda/intrinsics.hpp | 4 +- include/RAJA/policy/cuda/policy.hpp | 58 +++++++++++----------- include/RAJA/policy/cuda/reduce.hpp | 64 ++++++++++++------------- include/RAJA/policy/hip/intrinsics.hpp | 4 +- include/RAJA/policy/hip/policy.hpp | 58 +++++++++++----------- include/RAJA/policy/hip/reduce.hpp | 64 ++++++++++++------------- test/include/RAJA_test-reducepol.hpp | 24 +++++----- 7 files changed, 138 insertions(+), 138 deletions(-) diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp index c908046cac..b0d2ea7cf1 100644 --- a/include/RAJA/policy/cuda/intrinsics.hpp +++ b/include/RAJA/policy/cuda/intrinsics.hpp @@ -57,7 +57,7 @@ namespace impl * so device scope fences are required to make memory accesses visible * to the whole device. */ -struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -90,7 +90,7 @@ struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor * ****************************************************************************** */ -struct AccessorDeviceScopeUseSharedCache +struct AccessorDeviceScopeUseBlockFence { // cuda has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index fb1a2f90ae..a2aff97373 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -162,15 +162,15 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer enum struct reduce_algorithm : int { - finalize_last_block, - init_first_block_finalize_block_atomic, - init_host_finalize_block_atomic + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block }; enum struct block_communication_mode : int { device_fence, - avoid_device_fence + block_fence }; template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, @@ -303,49 +303,49 @@ using cuda_reduce_tuning = cuda_reduce_policy< RAJA::cuda::ReduceTuning< // the memory used with atomics is initialized on the host which is // significantly cheaper on some HW. On some HW this is faster overall than // the non-atomic and atomic policies. -// - *with_fences policies use normal memory accesses with device scope fences +// - *device_fence policies use normal memory accesses with device scope fences // in the implementation. This works on all HW. -// - *avoid_fences policies use special (atomic) memory accesses that only cache +// - *block_fence policies use special (atomic) memory accesses that only cache // in a cache shared by the whole device to avoid having to use // device scope fences. This improves performance on some HW but // is more difficult to code correctly. 
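In user code the renamed policies drop into the usual reducer pattern unchanged; only the policy spelling differs. A minimal sketch, assuming the standard RAJA forall/ReduceSum API and CUDA compilation:

    #include <RAJA/RAJA.hpp>

    // Sum the entries of x on the device. cuda_reduce is the deterministic
    // default; cuda_reduce_atomic (or one of the explicitly named variants
    // below) may trade run-to-run reproducibility for speed.
    void sum_example(const double* x, int N, double& out)
    {
      RAJA::ReduceSum<RAJA::cuda_reduce, double> sum(0.0);
      RAJA::forall<RAJA::cuda_exec<256>>(RAJA::RangeSegment(0, N),
          [=] RAJA_DEVICE (int i) { sum += x[i]; });
      out = sum.get();
    }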
-using cuda_reduce_with_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::finalize_last_block, +using cuda_reduce_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, RAJA::cuda::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_avoid_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::finalize_last_block, - RAJA::cuda::block_communication_mode::avoid_device_fence, +using cuda_reduce_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::combine_last_block, + RAJA::cuda::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_with_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, +using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, RAJA::cuda::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_avoid_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_first_block_finalize_block_atomic, - RAJA::cuda::block_communication_mode::avoid_device_fence, +using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_device_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_with_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, +using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, RAJA::cuda::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using cuda_reduce_atomic_host_avoid_fences = cuda_reduce_tuning< - RAJA::cuda::reduce_algorithm::init_host_finalize_block_atomic, - RAJA::cuda::block_communication_mode::avoid_device_fence, +using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning< + RAJA::cuda::reduce_algorithm::init_host_combine_atomic_block, + RAJA::cuda::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way -using cuda_reduce = cuda_reduce_with_fences; +using cuda_reduce = cuda_reduce_device_fence; // Policy for RAJA::Reduce* objects that may use atomics and may not give the // same answer every time when used in the same way -using cuda_reduce_atomic = cuda_reduce_atomic_host_with_fences; +using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool @@ -1229,12 +1229,12 @@ using policy::cuda::cuda_atomic; using policy::cuda::cuda_atomic_explicit; // policies usable with reducers -using policy::cuda::cuda_reduce_with_fences; -using policy::cuda::cuda_reduce_avoid_fences; -using policy::cuda::cuda_reduce_atomic_with_fences; -using policy::cuda::cuda_reduce_atomic_avoid_fences; -using policy::cuda::cuda_reduce_atomic_host_with_fences; -using policy::cuda::cuda_reduce_atomic_host_avoid_fences; +using policy::cuda::cuda_reduce_device_fence; +using policy::cuda::cuda_reduce_block_fence; +using policy::cuda::cuda_reduce_atomic_device_init_device_fence; +using 
policy::cuda::cuda_reduce_atomic_device_init_block_fence; +using policy::cuda::cuda_reduce_atomic_host_init_device_fence; +using policy::cuda::cuda_reduce_atomic_host_init_block_fence; using policy::cuda::cuda_reduce_base; using policy::cuda::cuda_reduce; using policy::cuda::cuda_reduce_atomic; diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp index ccdfe43e63..516b02383c 100644 --- a/include/RAJA/policy/cuda/reduce.hpp +++ b/include/RAJA/policy/cuda/reduce.hpp @@ -127,7 +127,7 @@ namespace impl template -RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, T identity, TempIterator in_device_mem, unsigned int* device_count) @@ -328,7 +328,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red // returns true if put reduced value in val template -RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, T identity, T* device_mem, unsigned int* device_count) @@ -391,7 +391,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, //! reduce values in block into thread 0 and atomically combines into device_mem template -RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, T identity, T* device_mem) { @@ -604,7 +604,7 @@ class PinnedTally //! pointer template -struct Reduce_Data +struct ReduceLastBlock_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -618,14 +618,14 @@ struct Reduce_Data RAJA::detail::SoAPtr device; bool owns_device_pointer; - Reduce_Data() : Reduce_Data(T(), T()){} + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){} /*! \brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -635,7 +635,7 @@ struct Reduce_Data } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -644,7 +644,7 @@ struct Reduce_Data { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -662,7 +662,7 @@ struct Reduce_Data { T temp = value; - size_t replicationId = impl::grid_reduce< + size_t replicationId = impl::grid_reduce_last_block< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -705,7 +705,7 @@ struct Reduce_Data //! 
Reduction data for Cuda Offload -- stores value, host pointer template -struct ReduceAtomicInitialized_Data +struct ReduceAtomicHostInit_Data { using tally_mempool_type = device_pinned_mempool_type; @@ -716,9 +716,9 @@ struct ReduceAtomicInitialized_Data bool is_setup; bool owns_device_pointer; - ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){}; + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){}; - ReduceAtomicInitialized_Data(T initValue, T identity_) + ReduceAtomicHostInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, is_setup{false}, @@ -727,7 +727,7 @@ struct ReduceAtomicInitialized_Data } RAJA_HOST_DEVICE - ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) : value{other.identity}, identity{other.identity}, is_setup{other.is_setup}, @@ -735,7 +735,7 @@ struct ReduceAtomicInitialized_Data { } - ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -753,7 +753,7 @@ struct ReduceAtomicInitialized_Data { T temp = value; - impl::grid_reduce_atomic_initialized( temp, identity, output); } @@ -786,7 +786,7 @@ struct ReduceAtomicInitialized_Data //! Reduction data for Cuda Offload -- stores value, host pointer template -struct ReduceAtomic_Data +struct ReduceAtomicDeviceInit_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -800,9 +800,9 @@ struct ReduceAtomic_Data T* device; bool owns_device_pointer; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){}; + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){}; - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -812,7 +812,7 @@ struct ReduceAtomic_Data } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -821,7 +821,7 @@ struct ReduceAtomic_Data { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -839,7 +839,7 @@ struct ReduceAtomic_Data { T temp = value; - size_t replicationId = impl::grid_reduce_atomic< + size_t replicationId = impl::grid_reduce_atomic_device_init< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -891,26 +891,26 @@ class Reduce ? 
RAJA_DIVIDE_CEILING_INT(policy::cuda::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorDeviceScopeUseSharedCache, + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorDeviceScopeUseLocalCache, + impl::AccessorDeviceScopeUseDeviceFence, void>>; static constexpr bool atomic_policy = - (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || - (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available::value; //! cuda reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || (atomic_policy && !atomic_available), - cuda::Reduce_Data, + cuda::ReduceLastBlock_Data, std::conditional_t, - std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), - cuda::ReduceAtomicInitialized_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block), + cuda::ReduceAtomicDeviceInit_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + cuda::ReduceAtomicHostInit_Data, void>>, void>>; diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp index c36f609f90..354e5d7278 100644 --- a/include/RAJA/policy/hip/intrinsics.hpp +++ b/include/RAJA/policy/hip/intrinsics.hpp @@ -57,7 +57,7 @@ namespace impl * so device scope fences are required to make memory accesses visible * to the whole device. 
*/ -struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor +struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor { static RAJA_DEVICE RAJA_INLINE void fence_acquire() { @@ -90,7 +90,7 @@ struct AccessorDeviceScopeUseLocalCache : RAJA::detail::DefaultAccessor * ****************************************************************************** */ -struct AccessorDeviceScopeUseSharedCache +struct AccessorDeviceScopeUseBlockFence { // hip has 32 and 64 bit atomics static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int); diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index d1985ce667..df0995f59c 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -157,15 +157,15 @@ struct AvoidDeviceMaxThreadOccupancyConcretizer enum struct reduce_algorithm : int { - finalize_last_block, - init_first_block_finalize_block_atomic, - init_host_finalize_block_atomic + combine_last_block, + init_device_combine_atomic_block, + init_host_combine_atomic_block }; enum struct block_communication_mode : int { device_fence, - avoid_device_fence + block_fence }; template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode, @@ -295,49 +295,49 @@ using hip_reduce_tuning = hip_reduce_policy< RAJA::hip::ReduceTuning< // the memory used with atomics is initialized on the host which is // significantly cheaper on some HW. On some HW this is faster overall than // the non-atomic and atomic policies. -// - *with_fences policies use normal memory accesses with device scope fences +// - *device_fence policies use normal memory accesses with device scope fences // in the implementation. This works on all HW. -// - *avoid_fences policies use special (atomic) memory accesses that only cache +// - *block_fence policies use special (atomic) memory accesses that only cache // in a cache shared by the whole device to avoid having to use // device scope fences. This improves performance on some HW but // is more difficult to code correctly. 
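The boolean selector keeps working after the renaming; a small compile-time sketch (the equivalences follow from the alias definitions in this file):

    #include <type_traits>
    #include <RAJA/RAJA.hpp>

    // hip_reduce_base<maybe_atomic> simply forwards to one of the two defaults.
    static_assert(std::is_same<RAJA::hip_reduce_base<false>, RAJA::hip_reduce>::value,
                  "false selects the deterministic default");
    static_assert(std::is_same<RAJA::hip_reduce_base<true>, RAJA::hip_reduce_atomic>::value,
                  "true selects the atomic default");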
-using hip_reduce_with_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::finalize_last_block, +using hip_reduce_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, RAJA::hip::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_avoid_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::finalize_last_block, - RAJA::hip::block_communication_mode::avoid_device_fence, +using hip_reduce_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::combine_last_block, + RAJA::hip::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_with_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, +using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, RAJA::hip::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_avoid_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_first_block_finalize_block_atomic, - RAJA::hip::block_communication_mode::avoid_device_fence, +using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_device_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_with_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, +using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, RAJA::hip::block_communication_mode::device_fence, named_usage::unspecified, named_usage::unspecified>; /// -using hip_reduce_atomic_host_avoid_fences = hip_reduce_tuning< - RAJA::hip::reduce_algorithm::init_host_finalize_block_atomic, - RAJA::hip::block_communication_mode::avoid_device_fence, +using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning< + RAJA::hip::reduce_algorithm::init_host_combine_atomic_block, + RAJA::hip::block_communication_mode::block_fence, named_usage::unspecified, named_usage::unspecified>; // Policy for RAJA::Reduce* objects that gives the same answer every time when // used in the same way -using hip_reduce = hip_reduce_avoid_fences; +using hip_reduce = hip_reduce_block_fence; // Policy for RAJA::Reduce* objects that may use atomics and may not give the // same answer every time when used in the same way -using hip_reduce_atomic = hip_reduce_atomic_host_avoid_fences; +using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool @@ -1147,12 +1147,12 @@ using policy::hip::hip_atomic; using policy::hip::hip_atomic_explicit; // policies usable with reducers -using policy::hip::hip_reduce_with_fences; -using policy::hip::hip_reduce_avoid_fences; -using policy::hip::hip_reduce_atomic_with_fences; -using policy::hip::hip_reduce_atomic_avoid_fences; -using policy::hip::hip_reduce_atomic_host_with_fences; -using policy::hip::hip_reduce_atomic_host_avoid_fences; +using policy::hip::hip_reduce_device_fence; +using policy::hip::hip_reduce_block_fence; +using policy::hip::hip_reduce_atomic_device_init_device_fence; +using policy::hip::hip_reduce_atomic_device_init_block_fence; +using 
policy::hip::hip_reduce_atomic_host_init_device_fence; +using policy::hip::hip_reduce_atomic_host_init_block_fence; using policy::hip::hip_reduce_base; using policy::hip::hip_reduce; using policy::hip::hip_reduce_atomic; diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp index 2258340b52..2dbaf9f7e5 100644 --- a/include/RAJA/policy/hip/reduce.hpp +++ b/include/RAJA/policy/hip/reduce.hpp @@ -121,7 +121,7 @@ namespace impl template -RAJA_DEVICE RAJA_INLINE int grid_reduce(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val, T identity, TempIterator in_device_mem, unsigned int* device_count) @@ -323,7 +323,7 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce(RAJA::expt::detail::Reducer& red template -RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, +RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val, T identity, T* device_mem, unsigned int* device_count) @@ -386,7 +386,7 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic(T& val, //! reduce values in block into thread 0 and atomically combines into device_mem template -RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_initialized(T& val, +RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val, T identity, T* device_mem) { @@ -600,7 +600,7 @@ class PinnedTally //! pointer template -struct Reduce_Data +struct ReduceLastBlock_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -614,14 +614,14 @@ struct Reduce_Data RAJA::detail::SoAPtr device; bool own_device_ptr; - Reduce_Data() : Reduce_Data(T(), T()){}; + ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){}; /*! \brief create from a default value and offload information * * allocates PinnedTally to hold device values */ - Reduce_Data(T initValue, T identity_) + ReduceLastBlock_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -631,7 +631,7 @@ struct Reduce_Data } RAJA_HOST_DEVICE - Reduce_Data(const Reduce_Data& other) + ReduceLastBlock_Data(const ReduceLastBlock_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -640,7 +640,7 @@ struct Reduce_Data { } - Reduce_Data& operator=(const Reduce_Data&) = default; + ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -657,7 +657,7 @@ struct Reduce_Data void grid_reduce(T* output) { T temp = value; - size_t replicationId = impl::grid_reduce< + size_t replicationId = impl::grid_reduce_last_block< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -701,7 +701,7 @@ struct Reduce_Data //! 
Reduction data for Hip Offload -- stores value, host pointer template -struct ReduceAtomicInitialized_Data +struct ReduceAtomicHostInit_Data { using tally_mempool_type = device_pinned_mempool_type; @@ -712,9 +712,9 @@ struct ReduceAtomicInitialized_Data bool is_setup; bool own_device_ptr; - ReduceAtomicInitialized_Data() : ReduceAtomicInitialized_Data(T(), T()){} + ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){} - ReduceAtomicInitialized_Data(T initValue, T identity_) + ReduceAtomicHostInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, is_setup{false}, @@ -723,7 +723,7 @@ struct ReduceAtomicInitialized_Data } RAJA_HOST_DEVICE - ReduceAtomicInitialized_Data(const ReduceAtomicInitialized_Data& other) + ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other) : value{other.identity}, identity{other.identity}, is_setup{other.is_setup}, @@ -731,7 +731,7 @@ struct ReduceAtomicInitialized_Data { } - ReduceAtomicInitialized_Data& operator=(const ReduceAtomicInitialized_Data&) = default; + ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -749,7 +749,7 @@ struct ReduceAtomicInitialized_Data { T temp = value; - impl::grid_reduce_atomic_initialized( + impl::grid_reduce_atomic_host_init( temp, identity, output); } @@ -781,7 +781,7 @@ struct ReduceAtomicInitialized_Data //! Reduction data for Hip Offload -- stores value, host pointer template -struct ReduceAtomic_Data +struct ReduceAtomicDeviceInit_Data { using tally_mempool_type = pinned_mempool_type; using data_mempool_type = device_mempool_type; @@ -795,9 +795,9 @@ struct ReduceAtomic_Data T* device; bool own_device_ptr; - ReduceAtomic_Data() : ReduceAtomic_Data(T(), T()){} + ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){} - ReduceAtomic_Data(T initValue, T identity_) + ReduceAtomicDeviceInit_Data(T initValue, T identity_) : value{initValue}, identity{identity_}, device_count{nullptr}, @@ -807,7 +807,7 @@ struct ReduceAtomic_Data } RAJA_HOST_DEVICE - ReduceAtomic_Data(const ReduceAtomic_Data& other) + ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other) : value{other.identity}, identity{other.identity}, device_count{other.device_count}, @@ -816,7 +816,7 @@ struct ReduceAtomic_Data { } - ReduceAtomic_Data& operator=(const ReduceAtomic_Data&) = default; + ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default; //! initialize output to identity to ensure never read // uninitialized memory @@ -834,7 +834,7 @@ struct ReduceAtomic_Data { T temp = value; - size_t replicationId = impl::grid_reduce_atomic< + size_t replicationId = impl::grid_reduce_atomic_device_init< Combiner, Accessor, replication, atomic_stride>( temp, identity, device, device_count); if (replicationId != replication) { @@ -886,26 +886,26 @@ class Reduce ? 
RAJA_DIVIDE_CEILING_INT(policy::hip::ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T)) : 1); - using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::avoid_device_fence), - impl::AccessorDeviceScopeUseSharedCache, + using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence), + impl::AccessorDeviceScopeUseBlockFence, std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence), - impl::AccessorDeviceScopeUseLocalCache, + impl::AccessorDeviceScopeUseDeviceFence, void>>; static constexpr bool atomic_policy = - (tuning::algorithm == reduce_algorithm::init_first_block_finalize_block_atomic) || - (tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic); + (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) || + (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block); static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available::value; //! hip reduction data storage class and folding algorithm - using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::finalize_last_block) || + using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) || (atomic_policy && !atomic_available), - hip::Reduce_Data, + hip::ReduceLastBlock_Data, std::conditional_t, - std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_finalize_block_atomic), - hip::ReduceAtomicInitialized_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block), + hip::ReduceAtomicDeviceInit_Data, + std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block), + hip::ReduceAtomicHostInit_Data, void>>, void>>; diff --git a/test/include/RAJA_test-reducepol.hpp b/test/include/RAJA_test-reducepol.hpp index cd97a686ca..e9e075b287 100644 --- a/test/include/RAJA_test-reducepol.hpp +++ b/test/include/RAJA_test-reducepol.hpp @@ -34,21 +34,21 @@ using OpenMPTargetReducePols = #endif #if defined(RAJA_ENABLE_CUDA) -using CudaReducePols = camp::list< RAJA::cuda_reduce_with_fences, - RAJA::cuda_reduce_avoid_fences, - RAJA::cuda_reduce_atomic_with_fences, - RAJA::cuda_reduce_atomic_avoid_fences, - RAJA::cuda_reduce_atomic_host_with_fences, - RAJA::cuda_reduce_atomic_host_avoid_fences >; +using CudaReducePols = camp::list< RAJA::cuda_reduce_device_fence, + RAJA::cuda_reduce_block_fence, + RAJA::cuda_reduce_atomic_device_init_device_fence, + RAJA::cuda_reduce_atomic_device_init_block_fence, + RAJA::cuda_reduce_atomic_host_init_device_fence, + RAJA::cuda_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_HIP) -using HipReducePols = camp::list< RAJA::hip_reduce_with_fences, - RAJA::hip_reduce_avoid_fences, - RAJA::hip_reduce_atomic_with_fences, - RAJA::hip_reduce_atomic_avoid_fences, - RAJA::hip_reduce_atomic_host_with_fences, - RAJA::hip_reduce_atomic_host_avoid_fences >; +using HipReducePols = camp::list< RAJA::hip_reduce_device_fence, + RAJA::hip_reduce_block_fence, + RAJA::hip_reduce_atomic_device_init_device_fence, + RAJA::hip_reduce_atomic_device_init_block_fence, + RAJA::hip_reduce_atomic_host_init_device_fence, + RAJA::hip_reduce_atomic_host_init_block_fence >; #endif #if defined(RAJA_ENABLE_SYCL) From 70523d2725a3ad77dfc32e4a128cdd249a7ada5e Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Tue, 30 Apr 2024 17:51:50 -0700 Subject: [PATCH 083/108] Update docs for renamings --- .../sphinx/user_guide/cook_book/reduction.rst | 6 
+-- docs/sphinx/user_guide/feature/policies.rst | 44 ++++++++++--------- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index e8925ee019..64fb172df7 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -52,15 +52,15 @@ RAJA uses policy types to specify how things are implemented. The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The -``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a CUDA GPU kernel with +``RAJA::cuda_exec_reduce_default<256>`` runs the loop as a CUDA GPU kernel with 256 threads per block and other CUDA kernel launch parameters, like the number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_rec_for_reduce<256>; - // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>; + // using exec_policy = RAJA::cuda_exec_reduce_default<256>; + // using exec_policy = RAJA::hip_exec_reduce_default<256>; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 9222af59c4..2b6c3574b8 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -280,15 +280,15 @@ policies have the prefix ``hip_``. Concretizer> policy but the grid size is determined by the concretizer. - cuda/hip_exec_rec_for_reduce forall The cuda/hip exec policy - that is recommended for - use with reducers. In general using - the occupancy calculator policies - are better but exactly how much - occupancy to use differs by platform - so this policy provides a simple way - to get what works best for that platform - without having to know the details. + cuda/hip_exec_reduce_default forall The cuda/hip exec policy that is + recommended for use with reducers. + In general using the occupancy + calculator policies are better for + reducers but exactly how much occupancy + to use differs by platform so this policy + provides a simple way to get what works + best for a platform without having to + know the details. cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed @@ -758,22 +758,26 @@ cuda/hip_reduce any CUDA/HIP Parallel reduction in a C policy (device synchronization will occur when reduction value is finalized). cuda/hip_reduce\*atomic\* any CUDA/HIP Same as above, but reduction may use - policy atomic operations and initializes the - memory used for atomics on the device. - This works on all architectures but - incurs higher overheads. -cuda/hip_reduce\*atomic_host\* any CUDA/HIP Same as above, but reduction may use - policy atomic operations and initializes the + policy atomic operations leading to run to run + variability in the results. +cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the memory used for atomics on the host. This works on recent architectures and incurs lower overheads. 
-cuda/hip_reduce\*with_fences any CUDA/HIP Same as above, and reduction uses normal - policy memory accesses with device scope fences. +cuda/hip_reduce\*device_init\* any CUDA/HIP Same as above, but initializes the + memory used for atomics on the device. This works on all architectures but incurs higher overheads. -cuda/hip_reduce\*avoid_fences any CUDA/HIP Same as above, and reduction uses special - policy memory accesses to allow it to avoid - device scope fences. This improves +cuda/hip_reduce\*device_fence any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses that are not visible across + the whole device and device scope fences + to ensure visibility and ordering. + This works on all architectures but + incurs higher overheads on some architectures. +cuda/hip_reduce\*block_fence any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to a level of cache shared + visible to the whole device and block scope + fences to ensure ordering. This improves performance on some architectures. sycl_reduce any SYCL Reduction in a SYCL kernel (device policy synchronization will occur when the From 1b79d608a0f58580b155da2136e174d528f916bb Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 15:07:45 -0700 Subject: [PATCH 084/108] Use conditional_t in AsIntegerArray --- include/RAJA/util/types.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp index 8441f75522..7e331ef00e 100644 --- a/include/RAJA/util/types.hpp +++ b/include/RAJA/util/types.hpp @@ -899,31 +899,31 @@ struct AsIntegerArray { static_assert(min_integer_type_size <= max_integer_type_size, "incompatible min and max integer type size"); - using integer_type = typename std::conditional< + using integer_type = std::conditional_t< ((alignof(T) >= alignof(unsigned long long) && sizeof(unsigned long long) <= max_integer_type_size) || sizeof(unsigned long) < min_integer_type_size), unsigned long long, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned long) && sizeof(unsigned long) <= max_integer_type_size) || sizeof(unsigned int) < min_integer_type_size), unsigned long, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned int) && sizeof(unsigned int) <= max_integer_type_size) || sizeof(unsigned short) < min_integer_type_size), unsigned int, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned short) && sizeof(unsigned short) <= max_integer_type_size) || sizeof(unsigned char) < min_integer_type_size), unsigned short, - typename std::conditional< + std::conditional_t< ((alignof(T) >= alignof(unsigned char) && sizeof(unsigned char) <= max_integer_type_size)), unsigned char, - void>::type>::type>::type>::type>::type; + void>>>>>; static_assert(!std::is_same::value, "could not find a compatible integer type"); static_assert(sizeof(integer_type) >= min_integer_type_size, From f6e62bc800fa8d137b04e7b2dbad7e14dc41ed74 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 15:32:48 -0700 Subject: [PATCH 085/108] Rename to exec_with_reduce --- docs/sphinx/user_guide/cook_book/reduction.rst | 6 +++--- docs/sphinx/user_guide/feature/policies.rst | 2 +- include/RAJA/policy/cuda/policy.hpp | 8 ++++---- include/RAJA/policy/hip/policy.hpp | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst 
index 64fb172df7..a750ee149c 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -52,15 +52,15 @@ RAJA uses policy types to specify how things are implemented. The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied. For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The -``RAJA::cuda_exec_reduce_default<256>`` runs the loop as a CUDA GPU kernel with +``RAJA::cuda_exec_with_reduce<256>`` runs the loop as a CUDA GPU kernel with 256 threads per block and other CUDA kernel launch parameters, like the number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_reduce_default<256>; - // using exec_policy = RAJA::hip_exec_reduce_default<256>; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; + // using exec_policy = RAJA::hip_exec_with_reduce<256>; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 2b6c3574b8..11d5aa5f05 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -280,7 +280,7 @@ policies have the prefix ``hip_``. Concretizer> policy but the grid size is determined by the concretizer. - cuda/hip_exec_reduce_default forall The cuda/hip exec policy that is + cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is recommended for use with reducers. 
In general using the occupancy calculator policies are better for diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index a2aff97373..ed6456e0fc 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1189,22 +1189,22 @@ using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit< Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; template -using cuda_exec_reduce_default_explicit = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>; template -using cuda_exec_reduce_default_explicit_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>; template -using cuda_exec_reduce_default = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>; template -using cuda_exec_reduce_default_async = policy::cuda::cuda_exec_explicit< +using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index df0995f59c..3712eccbb9 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1125,12 +1125,12 @@ using hip_exec_occ_custom_async = policy::hip::hip_exec< Concretizer, true>; template -using hip_exec_reduce_default = policy::hip::hip_exec< +using hip_exec_with_reduce = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipReduceDefaultConcretizer, Async>; template -using hip_exec_reduce_default_async = policy::hip::hip_exec< +using hip_exec_with_reduce_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipReduceDefaultConcretizer, true>; From 3143dda1509b184fdb4ae2baf15952d0022b2034 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 16:47:32 -0700 Subject: [PATCH 086/108] Add cuda/hip_exec_base This lets you choose between cuda/hip_exec and cuda/hip_exec_with_reduce similarly to how cuda/hip_reduce_base lets you choose betwen cuda/hip_reduce and cuda/hip_reduce_atomic --- .../sphinx/user_guide/cook_book/reduction.rst | 8 +++--- docs/sphinx/user_guide/feature/policies.rst | 26 ++++++++++++------- include/RAJA/policy/cuda/policy.hpp | 20 ++++++++++++++ include/RAJA/policy/hip/policy.hpp | 10 +++++++ 4 files changed, 50 insertions(+), 14 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index a750ee149c..b025f8a549 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -59,8 +59,8 @@ number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_with_reduce<256>; - // using exec_policy = RAJA::hip_exec_with_reduce<256>; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; // or RAJA::cuda_exec_base; + // using exec_policy = 
RAJA::hip_exec_with_reduce<256>; // or RAJA::hip_exec_base; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the @@ -72,8 +72,8 @@ data type, and can only be used with cuda execution policies. Similarly for othe using reduce_policy = RAJA::seq_reduce; // using reduce_policy = RAJA::omp_reduce; // using reduce_policy = RAJA::omp_target_reduce; - // using reduce_policy = RAJA::cuda_reduce_atomic; - // using reduce_policy = RAJA::hip_reduce_atomic; + // using reduce_policy = RAJA::cuda_reduce_atomic; // or RAJA::cuda_reduce_base + // using reduce_policy = RAJA::hip_reduce_atomic; // or RAJA::hip_reduce_base // using reduce_policy = RAJA::sycl_reduce; diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 11d5aa5f05..71291073ce 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -247,6 +247,18 @@ policies have the prefix ``hip_``. Note that the thread-block size must be provided, there is no default. + cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is + recommended for use with reducers. + In general using the occupancy + calculator policies are better for + reducers but exactly how much occupancy + to use differs by platform so this policy + provides a simple way to get what works + best for a platform without having to + know the details. + cuda/hip_exec_base cuda/hip_exec_with_reduce policies based on + the with_reduce boolean. cuda/hip_exec_grid forall, Execute loop iterations mapped to global threads via grid striding with multiple @@ -280,15 +292,6 @@ policies have the prefix ``hip_``. Concretizer> policy but the grid size is determined by the concretizer. - cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is - recommended for use with reducers. - In general using the occupancy - calculator policies are better for - reducers but exactly how much occupancy - to use differs by platform so this policy - provides a simple way to get what works - best for a platform without having to - know the details. cuda/hip_launch_t launch Launches a device kernel, any code expressed within the lambda is executed @@ -757,9 +760,12 @@ omp_target_reduce any OpenMP OpenMP parallel target of cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel policy (device synchronization will occur when reduction value is finalized). -cuda/hip_reduce\*atomic\* any CUDA/HIP Same as above, but reduction may use +cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use policy atomic operations leading to run to run variability in the results. +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and + policy cuda/hip_reduce_atomic policies based on + the maybe_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the memory used for atomics on the host. 
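The ``cuda/hip_exec_base`` and ``cuda/hip_reduce_base`` rows above are thin ``std::conditional_t`` switches over a boolean. A short sketch of what the documented selection is expected to resolve to, assuming the aliases are wired up as the policy.hpp hunks in this patch series describe (requires a CUDA-enabled RAJA build; the block size 256 is arbitrary)::

    #include <type_traits>
    #include "RAJA/RAJA.hpp"

    // true  -> the reducer-friendly execution policy, false -> the general one.
    static_assert(std::is_same<RAJA::cuda_exec_base<true, 256>,
                               RAJA::cuda_exec_with_reduce<256>>::value, "");
    static_assert(std::is_same<RAJA::cuda_exec_base<false, 256>,
                               RAJA::cuda_exec<256>>::value, "");

    // true  -> reduction may use atomics (run-to-run variability possible),
    // false -> the deterministic non-atomic reduction.
    static_assert(std::is_same<RAJA::cuda_reduce_base<true>,
                               RAJA::cuda_reduce_atomic>::value, "");
    static_assert(std::is_same<RAJA::cuda_reduce_base<false>,
                               RAJA::cuda_reduce>::value, "");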
This works on recent architectures and diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ed6456e0fc..d99a8c6c79 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1208,6 +1208,26 @@ using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< iteration_mapping::StridedLoop, cuda::global_x, CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>; +template +using cuda_exec_base_explicit = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base_explicit_async = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base = std::conditional_t, + cuda_exec>; + +template +using cuda_exec_base_async = std::conditional_t, + cuda_exec>; + // policies usable with WorkGroup template diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 3712eccbb9..5c9841aa8c 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1134,6 +1134,16 @@ using hip_exec_with_reduce_async = policy::hip::hip_exec< iteration_mapping::StridedLoop, hip::global_x, HipReduceDefaultConcretizer, true>; +template +using hip_exec_base = std::conditional_t, + hip_exec>; + +template +using hip_exec_base_async = std::conditional_t, + hip_exec>; + // policies usable with WorkGroup using policy::hip::hip_work; From 7df3f554287f3417eb030337b65bcd66430c9577 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Wed, 1 May 2024 17:03:09 -0700 Subject: [PATCH 087/108] Add extra bit to cookbook for base policies --- .../sphinx/user_guide/cook_book/reduction.rst | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index b025f8a549..d1190e222a 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -59,8 +59,8 @@ number of blocks, optimized for performance with reducers.:: using exec_policy = RAJA::seq_exec; // using exec_policy = RAJA::omp_parallel_for_exec; // using exec_policy = RAJA::omp_target_parallel_for_exec<256>; - // using exec_policy = RAJA::cuda_exec_with_reduce<256>; // or RAJA::cuda_exec_base; - // using exec_policy = RAJA::hip_exec_with_reduce<256>; // or RAJA::hip_exec_base; + // using exec_policy = RAJA::cuda_exec_with_reduce<256>; + // using exec_policy = RAJA::hip_exec_with_reduce<256>; // using exec_policy = RAJA::sycl_exec<256>; The reduction policy specifies how the reduction is done and must match the @@ -72,8 +72,8 @@ data type, and can only be used with cuda execution policies. 
Similarly for othe using reduce_policy = RAJA::seq_reduce; // using reduce_policy = RAJA::omp_reduce; // using reduce_policy = RAJA::omp_target_reduce; - // using reduce_policy = RAJA::cuda_reduce_atomic; // or RAJA::cuda_reduce_base - // using reduce_policy = RAJA::hip_reduce_atomic; // or RAJA::hip_reduce_base + // using reduce_policy = RAJA::cuda_reduce_atomic; + // using reduce_policy = RAJA::hip_reduce_atomic; // using reduce_policy = RAJA::sycl_reduce; @@ -91,3 +91,20 @@ Here a simple sum reduction is performed using RAJA:: The results of these operations will yield the following values: * vsum.get() == 1000 + + +Another option for the execution policy when using the cuda or hip backends are +the base policies which have a boolean parameter to choose between the general +use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy.:: + + // static constexpr bool with_reducers = ...; + // using exec_policy = RAJA::cuda_exec_base; + // using exec_policy = RAJA::hip_exec_base; + +Another option for the reduction policy when using the cuda or hip backends are +the base policies which have a boolean parameter to choose between the atomic +``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce`` policy.:: + + // static constexpr bool maybe_atomic = ...; + // using reduce_policy = RAJA::cuda_reduce_base; + // using reduce_policy = RAJA::hip_reduce_base; From a7d0b1bfe69a8620096ad21b63061bfc2a548ea8 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 10:15:40 -0700 Subject: [PATCH 088/108] Use with_atomic and with_reduce more consistently --- docs/sphinx/user_guide/cook_book/reduction.rst | 12 ++++++------ docs/sphinx/user_guide/feature/policies.rst | 4 ++-- include/RAJA/policy/cuda/policy.hpp | 4 ++-- include/RAJA/policy/hip/policy.hpp | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index d1190e222a..3ee2b479f2 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -97,14 +97,14 @@ Another option for the execution policy when using the cuda or hip backends are the base policies which have a boolean parameter to choose between the general use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy.:: - // static constexpr bool with_reducers = ...; - // using exec_policy = RAJA::cuda_exec_base; - // using exec_policy = RAJA::hip_exec_base; + // static constexpr bool with_reduce = ...; + // using exec_policy = RAJA::cuda_exec_base; + // using exec_policy = RAJA::hip_exec_base; Another option for the reduction policy when using the cuda or hip backends are the base policies which have a boolean parameter to choose between the atomic ``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce`` policy.:: - // static constexpr bool maybe_atomic = ...; - // using reduce_policy = RAJA::cuda_reduce_base; - // using reduce_policy = RAJA::hip_reduce_base; + // static constexpr bool with_atomic = ...; + // using reduce_policy = RAJA::cuda_reduce_base; + // using reduce_policy = RAJA::hip_reduce_base; diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 71291073ce..ec35367cee 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -763,9 +763,9 @@ cuda/hip_reduce any CUDA/HIP Parallel reduction in a C cuda/hip_reduce_atomic any CUDA/HIP Same as 
above, but reduction may use policy atomic operations leading to run to run variability in the results. -cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and policy cuda/hip_reduce_atomic policies based on - the maybe_atomic boolean. + the with_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the memory used for atomics on the host. This works on recent architectures and diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index d99a8c6c79..ae510715ff 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -349,8 +349,8 @@ using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool -template < bool maybe_atomic > -using cuda_reduce_base = std::conditional_t; +template < bool with_atomic > +using cuda_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 5c9841aa8c..7c965a3c54 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -341,8 +341,8 @@ using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence; // Policy for RAJA::Reduce* objects that lets you select the default atomic or // non-atomic policy with a bool -template < bool maybe_atomic > -using hip_reduce_base = std::conditional_t; +template < bool with_atomic > +using hip_reduce_base = std::conditional_t; // Policy for RAJA::statement::Reduce that reduces threads in a block From 55ea1d353a6a4bd6840052ceb450303cf03bea82 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 10:20:13 -0700 Subject: [PATCH 089/108] Improve formatting of final values in reduce cookbook --- docs/sphinx/user_guide/cook_book/reduction.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/cook_book/reduction.rst b/docs/sphinx/user_guide/cook_book/reduction.rst index 3ee2b479f2..73843ebb40 100644 --- a/docs/sphinx/user_guide/cook_book/reduction.rst +++ b/docs/sphinx/user_guide/cook_book/reduction.rst @@ -46,7 +46,7 @@ Here a simple sum reduction is performed in a for loop:: The results of these operations will yield the following values: - * vsum == 1000 + * ``vsum == 1000`` RAJA uses policy types to specify how things are implemented. 
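The cookbook hunks above show the policy aliases, but the kernel itself sits outside the diff context. A minimal sketch of the sum reduction they refer to, assuming (as the surrounding text implies) 1000 device-accessible elements that each contribute 1; the helper name ``device_sum`` is illustrative::

    #include "RAJA/RAJA.hpp"

    // Sum N device-accessible integers; mirrors the cookbook's vsum example.
    int device_sum(const int* a, int N)
    {
      using exec_policy   = RAJA::cuda_exec_with_reduce<256>;
      using reduce_policy = RAJA::cuda_reduce_atomic;

      RAJA::ReduceSum<reduce_policy, int> vsum(0);

      RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
          [=] RAJA_DEVICE (int i) { vsum += a[i]; });

      return vsum.get();  // 1000 when N == 1000 and every a[i] == 1
    }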
@@ -90,7 +90,7 @@ Here a simple sum reduction is performed using RAJA:: The results of these operations will yield the following values: - * vsum.get() == 1000 + * ``vsum.get() == 1000`` Another option for the execution policy when using the cuda or hip backends are From 9793ed9fb3809740c4e76bf647eb87c87b98d73b Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 10:30:08 -0700 Subject: [PATCH 090/108] fix spacing --- docs/sphinx/user_guide/feature/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index ec35367cee..85d3ef475d 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -763,7 +763,7 @@ cuda/hip_reduce any CUDA/HIP Parallel reduction in a C cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use policy atomic operations leading to run to run variability in the results. -cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and +cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and policy cuda/hip_reduce_atomic policies based on the with_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the From f46adf9700d8f03b794471664cfd376ca8426e32 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 12:55:54 -0700 Subject: [PATCH 091/108] fix base exec policies --- include/RAJA/policy/cuda/policy.hpp | 12 ++++++------ include/RAJA/policy/hip/policy.hpp | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp index ae510715ff..84cd8a301c 100644 --- a/include/RAJA/policy/cuda/policy.hpp +++ b/include/RAJA/policy/cuda/policy.hpp @@ -1210,13 +1210,13 @@ using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit< template using cuda_exec_base_explicit = std::conditional_t, - cuda_exec>; + cuda_exec_with_reduce_explicit, + cuda_exec_explicit>; template using cuda_exec_base_explicit_async = std::conditional_t, - cuda_exec>; + cuda_exec_with_reduce_explicit_async, + cuda_exec_explicit_async>; template using cuda_exec_base = std::conditional_t using cuda_exec_base_async = std::conditional_t, - cuda_exec>; + cuda_exec_with_reduce_async, + cuda_exec_async>; // policies usable with WorkGroup diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp index 7c965a3c54..c359a68de0 100644 --- a/include/RAJA/policy/hip/policy.hpp +++ b/include/RAJA/policy/hip/policy.hpp @@ -1141,8 +1141,8 @@ using hip_exec_base = std::conditional_t using hip_exec_base_async = std::conditional_t, - hip_exec>; + hip_exec_with_reduce_async, + hip_exec_async>; // policies usable with WorkGroup using policy::hip::hip_work; From f987c3948f7951606ee400a068062b4be59740b1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 12:58:56 -0700 Subject: [PATCH 092/108] Try to fix tables in docs --- docs/sphinx/user_guide/feature/policies.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 85d3ef475d..d418163c11 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -257,7 +257,7 @@ policies have the prefix ``hip_``. best for a platform without having to know the details. 
cuda/hip_exec_base cuda/hip_exec_with_reduce policies based on + BLOCK_SIZE> cuda/hip_exec_with_reduce policies based on the with_reduce boolean. cuda/hip_exec_grid forall, Execute loop iterations mapped to global threads via @@ -285,11 +285,11 @@ policies have the prefix ``hip_``. of the kernel for performance reasons. cuda/hip_exec_occ_fraction> of the maximum occupancy + RAJA::Fraction> of the maximum occupancy of the kernel. cuda/hip_exec_occ_custom policy but the grid size + Concretizer> policy but the grid size is determined by the concretizer. cuda/hip_launch_t launch Launches a device kernel, @@ -458,8 +458,8 @@ Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupan occupancy of the device. Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an - Fraction, occupancy based on the max occupancy - BLOCKS_PER_SM_OFFSET> Using the following formula: +Fraction, occupancy based on the max occupancy +BLOCKS_PER_SM_OFFSET> Using the following formula: (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * sm_per_device From b4d4dce03b0c7072390f132c23f43fb3fe3b7c2a Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 13:39:17 -0700 Subject: [PATCH 093/108] convert gpu exec policy table into grid table --- docs/sphinx/user_guide/feature/policies.rst | 433 +++++++++++--------- 1 file changed, 238 insertions(+), 195 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index d418163c11..50301084ef 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -236,201 +236,244 @@ RAJA policies for GPU execution using CUDA or HIP are essentially identical. The only difference is that CUDA policies have the prefix ``cuda_`` and HIP policies have the prefix ``hip_``. - ========================================= ============= ======================================= - CUDA/HIP Execution Policies Works with Brief description - ========================================= ============= ======================================= - cuda/hip_exec forall, Execute loop iterations - scan, directly mapped to global threads - sort in a GPU kernel launched - with given thread-block - size and unbounded grid size. - Note that the thread-block - size must be provided, - there is no default. - cuda/hip_exec_with_reduce forall The cuda/hip exec policy that is - recommended for use with reducers. - In general using the occupancy - calculator policies are better for - reducers but exactly how much occupancy - to use differs by platform so this policy - provides a simple way to get what works - best for a platform without having to - know the details. - cuda/hip_exec_base cuda/hip_exec_with_reduce policies based on - the with_reduce boolean. - cuda/hip_exec_grid forall, Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size. - Note that the thread-block - size and grid size must be - provided, there is no default. - cuda/hip_exec_occ_max forall Execute loop iterations - mapped to global threads via - grid striding with multiple - iterations per global thread - in a GPU kernel launched - with given thread-block - size and grid size bounded - by the maximum occupancy of - the kernel. 
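The occupancy-based ``forall`` policies being re-documented in this hunk differ only in how the grid size is chosen, so switching between them is a one-line change in user code. A hedged sketch; the ``RAJA::Fraction`` spelling (integer type, numerator, denominator) is an assumption inferred from the table entry, not a confirmed signature::

    // Grid size chosen by the occupancy calculator, possibly below the maximum.
    using occ_calc_pol = RAJA::cuda_exec_occ_calc<256>;

    // Grid size capped at half of the maximum occupancy (assumed Fraction signature).
    using occ_half_pol =
        RAJA::cuda_exec_occ_fraction<256, RAJA::Fraction<std::size_t, 1, 2>>;

    RAJA::forall<occ_calc_pol>(RAJA::RangeSegment(0, N),
        [=] RAJA_DEVICE (int i) {
          // ... loop body, typically containing reducers ...
        });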
- cuda/hip_exec_occ_calc forall Similar to the occ_max - policy but may use less - than the maximum occupancy - determined by the occupancy calculator - of the kernel for performance - reasons. - cuda/hip_exec_occ_fraction> of the maximum occupancy - of the kernel. - cuda/hip_exec_occ_custom policy but the grid size - is determined by the - concretizer. - cuda/hip_launch_t launch Launches a device kernel, - any code expressed within - the lambda is executed - on the device. - cuda/hip_thread_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU threads - in x-dimension, one - iterate per thread - (see note below about - limitations) - cuda/hip_thread_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_thread_x_loop kernel (For) Similar to - launch (loop) thread-x-direct - policy, but use a - block-stride loop which - doesn't limit number of - loop iterates - cuda/hip_thread_y_loop kernel (For) Same as above, but for - launch (loop) threads in y-dimension - cuda/hip_thread_z_loop kernel (For) Same as above, but for - launch (loop) threads in z-dimension - cuda/hip_thread_syncable_loop kernel (For) Similar to thread-loop - launch (loop) policy, but safe to use - with Cuda/HipSyncThreads - cuda/hip_thread_size_x_direct kernel (For) Same as thread_x_direct - launch (loop) policy above but with - a compile time number of - threads - cuda/hip_thread_size_y_direct kernel (For) Same as above, but map - launch (loop) to threads in y-dim - cuda/hip_thread_size_z_direct kernel (For) Same as above, but map - launch (loop) to threads in z-dim - cuda/hip_flatten_threads_{xyz}_direct launch (loop) Reshapes threads in a - multi-dimensional thread - team into one-dimension, - accepts any permutation - of dimensions - cuda/hip_block_x_direct kernel (For) Map loop iterates - launch (loop) directly to GPU thread - blocks in x-dimension, - one iterate per block - cuda/hip_block_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dimension - cuda/hip_block_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dimension - cuda/hip_block_x_loop kernel (For) Similar to - launch (loop) block-x-direct policy, - but use a grid-stride - loop. - cuda/hip_block_y_loop kernel (For) Same as above, but use - launch (loop) blocks in y-dimension - cuda/hip_block_z_loop kernel (For) Same as above, but use - launch (loop) blocks in z-dimension - cuda/hip_block_size_x_direct kernel (For) Same as block_x_direct - launch (loop) policy above but with - a compile time number of - blocks - cuda/hip_block_size_y_direct kernel (For) Same as above, but map - launch (loop) to blocks in y-dim - cuda/hip_block_size_z_direct kernel (For) Same as above, but map - launch (loop) to blocks in z-dim - cuda/hip_global_x_direct kernel (For) Creates a unique thread - launch (loop) id for each thread on - x-dimension of the grid. - Same as computing - threadIdx.x + - threadDim.x * blockIdx.x. - cuda/hip_global_y_direct kernel (For) Same as above, but uses - launch (loop) globals in y-dimension. - cuda/hip_global_z_direct kernel (For) Same as above, but uses - launch (loop) globals in z-dimension. - cuda/hip_global_x_loop kernel (For) Similar to - launch (loop) global-x-direct policy, - but use a grid-stride - loop. 
- cuda/hip_global_y_loop kernel (For) Same as above, but use - launch (loop) globals in y-dimension - cuda/hip_global_z_loop kernel (For) Same as above, but use - launch (loop) globals in z-dimension - cuda/hip_global_size_x_direct kernel (For) Same as global_x_direct - launch (loop) policy above but with - a compile time block - size - cuda/hip_global_size_y_direct kernel (For) Same as above, but map - launch (loop) to globals in y-dim - cuda/hip_global_size_z_direct kernel (For) Same as above, but map - launch (loop) to globals in z-dim - cuda/hip_warp_direct kernel (For) Map work to threads - in a warp directly. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_loop kernel (For) Policy to map work to - threads in a warp using - a warp-stride loop. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can be - created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_direct> kernel (For) Policy to map work - directly to threads in a - warp using a bit mask. - Cannot be used in - conjunction with - cuda/hip_thread_x_* - policies. - Multiple warps can - be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_warp_masked_loop> kernel (For) Policy to map work to - threads in a warp using - a bit mask and a - warp-stride loop. Cannot - be used in conjunction - with cuda/hip_thread_x_* - policies. Multiple warps - can be created by using - cuda/hip_thread_y/z_* - policies. - cuda/hip_block_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread block. - cuda/hip_warp_reduce kernel Perform a reduction - (Reduce) across a single GPU - thread warp. - ========================================= ============= ======================================= ++-----------------------------------------+---------------+---------------------------------------+ +| CUDA/HIP Execution Policies | Works with | Brief description | ++=========================================+===============+=======================================+ +| cuda/hip_exec | forall, | Execute loop iterations | +| | scan, | directly mapped to global threads | +| | sort | in a GPU kernel launched | +| | | with given thread-block | +| | | size and unbounded grid size. | +| | | Note that the thread-block | +| | | size must be provided, | +| | | there is no default. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy that is | +| | | recommended for use with reducers. | +| | | In general using the occupancy | +| | | calculator policies are better for | +| | | reducers but exactly how much | +| | | occupancy to use differs by platform | +| | | so this policy provides a simple way | +| | | to get what works best for a platform | +| | | without having to know the details. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_base | | cuda/hip_exec_with_reduce policies | +| | | based on the with_reduce boolean. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_grid | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size. 
| +| | | Note that the thread-block | +| | | size and grid size must be | +| | | provided, there is no default. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_max | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size bounded | +| | | by the maximum occupancy of | +| | | the kernel. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | +| | | policy but may use less | +| | | than the maximum occupancy | +| | | determined by the occupancy | +| | | calculator of the kernel for | +| | | performance reasons. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_fraction> | | of the maximum occupancy | +| | | of the kernel. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_exec_occ_custom | | policy but the grid size | +| | | is determined by the | +| | | concretizer. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_launch_t | launch | Launches a device kernel, | +| | | any code expressed within | +| | | the lambda is executed | +| | | on the device. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU threads | +| | | in x-dimension, one | +| | | iterate per thread | +| | | (see note below about | +| | | limitations) | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_x_loop | kernel (For) | Similar to | +| | launch (loop) | thread-x-direct | +| | | policy, but use a | +| | | block-stride loop which | +| | | doesn't limit number of | +| | | loop iterates | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | +| | launch (loop) | policy, but safe to use | +| | | with Cuda/HipSyncThreads | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_size_x_direct| kernel (For) | Same as thread_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | 
threads | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_size_y_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_thread_size_z_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | +| | | multi-dimensional thread | +| | | team into one-dimension, | +| | | accepts any permutation | +| | | of dimensions | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU thread | +| | | blocks in x-dimension, | +| | | one iterate per block | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_x_loop | kernel (For) | Similar to | +| | launch (loop) | block-x-direct policy, | +| | | but use a grid-stride | +| | | loop. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | blocks | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | +| | launch (loop) | id for each thread on | +| | | x-dimension of the grid. | +| | | Same as computing | +| | | threadIdx.x + | +| | | threadDim.x * blockIdx.x. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in y-dimension. 
| ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in z-dimension. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_x_loop | kernel (For) | Similar to | +| | launch (loop) | global-x-direct policy, | +| | | but use a grid-stride | +| | | loop. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in y-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in z-dimension | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_size_x_direct| kernel (For) | Same as global_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_size_y_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_global_size_z_direct| kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_warp_direct | kernel (For) | Map work to threads | +| | | in a warp directly. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+---------------------------------------+ +| cuda/hip_warp_loop | kernel (For) | Policy to map work to | +| | | threads in a warp using | +| | | a warp-stride loop. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_warp_masked_direct>| kernel | Policy to map work | +| | (For) | directly to threads in a | +| | | warp using a bit mask. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can | +| | | be created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_warp_masked_loop> | kernel | Policy to map work to | +| | (For) | threads in a warp using | +| | | a bit mask and a | +| | | warp-stride loop. Cannot | +| | | be used in conjunction | +| | | with cuda/hip_thread_x_* | +| | | policies. Multiple warps | +| | | can be created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_block_reduce | kernel | Perform a reduction | +| | (Reduce) | across a single GPU | +| | | thread block. 
| ++-----------------------------------------+---------------+--------------------------------------+ +| cuda/hip_warp_reduce | kernel | Perform a reduction | +| | (Reduce) | across a single GPU | +| | | thread warp. | ++-----------------------------------------+---------------+--------------------------------------+ When a CUDA or HIP policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. The From deee60881fa0dd2d1b42b3221ad289ec78be91ed Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 14:33:50 -0700 Subject: [PATCH 094/108] Improve table formatting --- docs/sphinx/user_guide/feature/policies.rst | 465 ++++++++++---------- 1 file changed, 227 insertions(+), 238 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 50301084ef..5fd9a3f92e 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -236,244 +236,233 @@ RAJA policies for GPU execution using CUDA or HIP are essentially identical. The only difference is that CUDA policies have the prefix ``cuda_`` and HIP policies have the prefix ``hip_``. -+-----------------------------------------+---------------+---------------------------------------+ -| CUDA/HIP Execution Policies | Works with | Brief description | -+=========================================+===============+=======================================+ -| cuda/hip_exec | forall, | Execute loop iterations | -| | scan, | directly mapped to global threads | -| | sort | in a GPU kernel launched | -| | | with given thread-block | -| | | size and unbounded grid size. | -| | | Note that the thread-block | -| | | size must be provided, | -| | | there is no default. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy that is | -| | | recommended for use with reducers. | -| | | In general using the occupancy | -| | | calculator policies are better for | -| | | reducers but exactly how much | -| | | occupancy to use differs by platform | -| | | so this policy provides a simple way | -| | | to get what works best for a platform | -| | | without having to know the details. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_base | | cuda/hip_exec_with_reduce policies | -| | | based on the with_reduce boolean. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_grid | | mapped to global threads via | -| | | grid striding with multiple | -| | | iterations per global thread | -| | | in a GPU kernel launched | -| | | with given thread-block | -| | | size and grid size. | -| | | Note that the thread-block | -| | | size and grid size must be | -| | | provided, there is no default. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_max | forall | Execute loop iterations | -| | | mapped to global threads via | -| | | grid striding with multiple | -| | | iterations per global thread | -| | | in a GPU kernel launched | -| | | with given thread-block | -| | | size and grid size bounded | -| | | by the maximum occupancy of | -| | | the kernel. 
| -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | -| | | policy but may use less | -| | | than the maximum occupancy | -| | | determined by the occupancy | -| | | calculator of the kernel for | -| | | performance reasons. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_fraction> | | of the maximum occupancy | -| | | of the kernel. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_exec_occ_custom | | policy but the grid size | -| | | is determined by the | -| | | concretizer. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_launch_t | launch | Launches a device kernel, | -| | | any code expressed within | -| | | the lambda is executed | -| | | on the device. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates | -| | launch (loop) | directly to GPU threads | -| | | in x-dimension, one | -| | | iterate per thread | -| | | (see note below about | -| | | limitations) | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to threads in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to threads in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_x_loop | kernel (For) | Similar to | -| | launch (loop) | thread-x-direct | -| | | policy, but use a | -| | | block-stride loop which | -| | | doesn't limit number of | -| | | loop iterates | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | -| | launch (loop) | threads in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | -| | launch (loop) | threads in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | -| | launch (loop) | policy, but safe to use | -| | | with Cuda/HipSyncThreads | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_size_x_direct| kernel (For) | Same as thread_x_direct | -| | launch (loop) | policy above but with | -| | | a compile time number of | -| | | threads | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_size_y_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to threads in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_thread_size_z_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to threads in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| 
cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | -| | | multi-dimensional thread | -| | | team into one-dimension, | -| | | accepts any permutation | -| | | of dimensions | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | -| | launch (loop) | directly to GPU thread | -| | | blocks in x-dimension, | -| | | one iterate per block | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_x_loop | kernel (For) | Similar to | -| | launch (loop) | block-x-direct policy, | -| | | but use a grid-stride | -| | | loop. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | -| | launch (loop) | blocks in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | -| | launch (loop) | blocks in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | -| | launch (loop) | policy above but with | -| | | a compile time number of | -| | | blocks | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | -| | launch (loop) | to blocks in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | -| | launch (loop) | id for each thread on | -| | | x-dimension of the grid. | -| | | Same as computing | -| | | threadIdx.x + | -| | | threadDim.x * blockIdx.x. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | -| | launch (loop) | globals in y-dimension. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | -| | launch (loop) | globals in z-dimension. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_x_loop | kernel (For) | Similar to | -| | launch (loop) | global-x-direct policy, | -| | | but use a grid-stride | -| | | loop. 
| -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | -| | launch (loop) | globals in y-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | -| | launch (loop) | globals in z-dimension | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_size_x_direct| kernel (For) | Same as global_x_direct | -| | launch (loop) | policy above but with | -| | | a compile time block | -| | | size | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_size_y_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to globals in y-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_global_size_z_direct| kernel (For) | Same as above, but map | -| | launch (loop) | to globals in z-dim | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_warp_direct | kernel (For) | Map work to threads | -| | | in a warp directly. | -| | | Cannot be used in | -| | | conjunction with | -| | | cuda/hip_thread_x_* | -| | | policies. | -| | | Multiple warps can be | -| | | created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+---------------------------------------+ -| cuda/hip_warp_loop | kernel (For) | Policy to map work to | -| | | threads in a warp using | -| | | a warp-stride loop. | -| | | Cannot be used in | -| | | conjunction with | -| | | cuda/hip_thread_x_* | -| | | policies. | -| | | Multiple warps can be | -| | | created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_warp_masked_direct>| kernel | Policy to map work | -| | (For) | directly to threads in a | -| | | warp using a bit mask. | -| | | Cannot be used in | -| | | conjunction with | -| | | cuda/hip_thread_x_* | -| | | policies. | -| | | Multiple warps can | -| | | be created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_warp_masked_loop> | kernel | Policy to map work to | -| | (For) | threads in a warp using | -| | | a bit mask and a | -| | | warp-stride loop. Cannot | -| | | be used in conjunction | -| | | with cuda/hip_thread_x_* | -| | | policies. Multiple warps | -| | | can be created by using | -| | | cuda/hip_thread_y/z_* | -| | | policies. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_block_reduce | kernel | Perform a reduction | -| | (Reduce) | across a single GPU | -| | | thread block. | -+-----------------------------------------+---------------+--------------------------------------+ -| cuda/hip_warp_reduce | kernel | Perform a reduction | -| | (Reduce) | across a single GPU | -| | | thread warp. 
| -+-----------------------------------------+---------------+--------------------------------------+ ++----------------------------------------------------+---------------+---------------------------------+ +| CUDA/HIP Execution Policies | Works with | Brief description | ++====================================================+===============+=================================+ +| cuda/hip_exec | forall, | Execute loop iterations | +| | scan, | directly mapped to global | +| | sort | threads in a GPU kernel | +| | | launched with given threadblock | +| | | size and unbounded grid size. | +| | | Note that the threadblock | +| | | size must be provided. | +| | | There is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_with_reduce | forall | The cuda/hip exec policy | +| | | recommended for use with | +| | | kernels containing reductions. | +| | | In general, using the occupancy | +| | | calculator policies improves | +| | | performance of kernels with | +| | | reductions. Exactly how much | +| | | occupancy to use differs by | +| | | platform. This policy provides | +| | | a simple way to get what works | +| | | well for a platform without | +| | | having to know the details. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_base | forall | Choose between cuda/hip_exec | +| | | and cuda/hip_exec_with_reduce | +| | | policies based on the boolean | +| | | template parameter 'with_reduce'| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_grid | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size. | +| | | Note that the thread-block | +| | | size and grid size must be | +| | | provided, there is no default. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_max | forall | Execute loop iterations | +| | | mapped to global threads via | +| | | grid striding with multiple | +| | | iterations per global thread | +| | | in a GPU kernel launched | +| | | with given thread-block | +| | | size and grid size bounded | +| | | by the maximum occupancy of | +| | | the kernel. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_calc | forall | Similar to the occ_max | +| | | policy but may use less | +| | | than the maximum occupancy | +| | | determined by the occupancy | +| | | calculator of the kernel for | +| | | performance reasons. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_fraction> | | | +| | | | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | +| | | policy but the grid size is | +| | | is determined by concretizer. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_launch_t | launch | Launches a device kernel, any | +| | | code inside the lambda | +| | | expression is executed | +| | | on the device. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_direct | kernel (For) | Map loop iterates directly to | +| | launch (loop) | GPU threads in x-dimension, one | +| | | iterate per thread. See note | +| | | below about limitations. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_x_loop | kernel (For) | Similar to thread-x-direct | +| | launch (loop) | policy, but use a block-stride | +| | | loop which doesn't limit total | +| | | number of loop iterates. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_y_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_z_loop | kernel (For) | Same as above, but for | +| | launch (loop) | threads in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_syncable_loop | kernel (For) | Similar to thread-loop | +| | launch (loop) | policy, but safe to use | +| | | with Cuda/HipSyncThreads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_x_direct | kernel (For) | Same as thread_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | threads. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_thread_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to threads in z-dimension. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_flatten_threads_{xyz}_direct | launch (loop) | Reshapes threads in a | +| | | multi-dimensional thread | +| | | team into one-dimension, | +| | | accepts any permutation | +| | | of dimensions | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_direct | kernel (For) | Map loop iterates | +| | launch (loop) | directly to GPU thread | +| | | blocks in x-dimension, | +| | | one iterate per block | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_x_loop | kernel (For) | Similar to | +| | launch (loop) | block-x-direct policy, | +| | | but use a grid-stride | +| | | loop. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | blocks in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_x_direct | kernel (For) | Same as block_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time number of | +| | | blocks | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to blocks in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_direct | kernel (For) | Creates a unique thread | +| | launch (loop) | id for each thread on | +| | | x-dimension of the grid. | +| | | Same as computing | +| | | threadIdx.x + | +| | | threadDim.x * blockIdx.x. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in y-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_direct | kernel (For) | Same as above, but uses | +| | launch (loop) | globals in z-dimension. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_x_loop | kernel (For) | Similar to | +| | launch (loop) | global-x-direct policy, | +| | | but use a grid-stride | +| | | loop. 
| ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_y_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in y-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_z_loop | kernel (For) | Same as above, but use | +| | launch (loop) | globals in z-dimension | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_x_direct | kernel (For) | Same as global_x_direct | +| | launch (loop) | policy above but with | +| | | a compile time block | +| | | size | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_y_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in y-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_global_size_z_direct | kernel (For) | Same as above, but map | +| | launch (loop) | to globals in z-dim | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_direct | kernel (For) | Map work to threads | +| | | in a warp directly. | +| | | Cannot be used in | +| | | conjunction with | +| | | cuda/hip_thread_x_* | +| | | policies. | +| | | Multiple warps can be | +| | | created by using | +| | | cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_loop | kernel (For) | Map work to threads in a warp | +| | | using a warp-stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_direct> | kernel | Mmap work directly to threads | +| | (For) | in a warp using a bit mask. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_masked_loop> | kernel | Map work to threads in a warp | +| | (For) | using a bit mask and a warp- | +| | | stride loop. | +| | | Cannot be used with | +| | | cuda/hip_thread_x_* policies. | +| | | Multiple warps can be created | +| | | by using cuda/hip_thread_y/z_* | +| | | policies. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_block_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread block. | ++----------------------------------------------------+---------------+---------------------------------+ +| cuda/hip_warp_reduce | kernel | Perform a reduction across a | +| | (Reduce) | single GPU thread warp. | +| | | thread warp. | ++----------------------------------------------------+---------------+---------------------------------+ When a CUDA or HIP policy leaves parameters like the block size and/or grid size unspecified a concretizer object is used to decide those parameters. 
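As a rough sketch of how the policies in the table above are typically used with
``RAJA::forall`` (the block size of 256, the device pointers ``d_a`` and ``d_b``,
and the length ``N`` are illustrative assumptions; memory allocation and transfer
are omitted)::

   #include "RAJA/RAJA.hpp"

   void daxpy_and_dot(double* d_a, const double* d_b, double alpha, int N)
   {
     // cuda_exec requires an explicit thread-block size; there is no default.
     using exec_pol   = RAJA::cuda_exec<256>;
     // cuda_exec_with_reduce is the policy recommended above for kernels
     // that contain reductions.
     using reduce_pol = RAJA::cuda_exec_with_reduce<256>;

     // Elementwise update: one loop iterate per global thread.
     RAJA::forall<exec_pol>(RAJA::RangeSegment(0, N),
       [=] RAJA_DEVICE (int i) {
         d_a[i] += alpha * d_b[i];
       });

     // Dot product: the reduction value is finalized when get() is called.
     RAJA::ReduceSum<RAJA::cuda_reduce, double> dot(0.0);
     RAJA::forall<reduce_pol>(RAJA::RangeSegment(0, N),
       [=] RAJA_DEVICE (int i) {
         dot += d_a[i] * d_b[i];
       });

     double result = dot.get();
     (void) result;  // use result as needed
   }

The equivalent HIP policies use the ``hip_`` prefix, as noted above.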
The From b12ff8faebfb365f8262ec9c7720f6956958f764 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 13:50:48 -0700 Subject: [PATCH 095/108] Fix concretizer table --- docs/sphinx/user_guide/feature/policies.rst | 52 ++++++++++----------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 5fd9a3f92e..4c69ef44ca 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -469,33 +469,31 @@ unspecified a concretizer object is used to decide those parameters. The following concretizers are available to use in the ``cuda/hip_exec_occ_custom`` policies: -=================================================== ========================================= -Execution Policy Brief description -=================================================== ========================================= - -Cuda/HipDefaultConcretizer The default concretizer, expected to - provide good performance in general. - Note that it may not use max occupancy. - -Cuda/HipRecForReduceConcretizer Expected to provide good performance - in loops with reducers. - Note that it may not use max occupancy. - -Cuda/HipMaxOccupancyConcretizer Uses max occupancy. - -Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer Avoids using the max occupancy of the - device in terms of threads. - Note that it may use the max occupancy - of the kernel if that is below the max - occupancy of the device. - -Cuda/HipFractionOffsetOccupancyConcretizer< Uses a fraction and offset to choose an -Fraction, occupancy based on the max occupancy -BLOCKS_PER_SM_OFFSET> Using the following formula: - (Fraction * kernel_max_blocks_per_sm + - BLOCKS_PER_SM_OFFSET) * sm_per_device - -=================================================== ========================================= ++----------------------------------------------------+-----------------------------------------+ +| Execution Policy | Brief description | ++====================================================+=========================================+ +| Cuda/HipDefaultConcretizer | The default concretizer, expected to | +| | provide good performance in general. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipRecForReduceConcretizer | Expected to provide good performance | +| | in loops with reducers. | +| | Note that it may not use max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipMaxOccupancyConcretizer | Uses max occupancy. | ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipAvoidDeviceMaxThreadOccupancyConcretizer | Avoids using the max occupancy of the | +| | device in terms of threads. | +| | Note that it may use the max occupancy | +| | of the kernel if that is below the max | +| | occupancy of the device. 
| ++----------------------------------------------------+-----------------------------------------+ +| Cuda/HipFractionOffsetOccupancyConcretizer< | Uses a fraction and offset to choose an | +| Fraction, | occupancy based on the max occupancy | +| BLOCKS_PER_SM_OFFSET> | Using the following formula: | +| | (Fraction * kernel_max_blocks_per_sm + | +| | BLOCKS_PER_SM_OFFSET) * sm_per_device | ++----------------------------------------------------+-----------------------------------------+ Several notable constraints apply to RAJA CUDA/HIP *direct* policies. From f22d6a7e4573aae604c3632ce516ccce862ac619 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 14:53:18 -0700 Subject: [PATCH 096/108] Attempt to claarify some reduction policy type names. --- docs/sphinx/user_guide/feature/policies.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 4c69ef44ca..7660ab2a90 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -797,20 +797,20 @@ cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_r policy cuda/hip_reduce_atomic policies based on the with_atomic boolean. cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the - memory used for atomics on the host. + policy memory used for atomics on the host. This works on recent architectures and incurs lower overheads. cuda/hip_reduce\*device_init\* any CUDA/HIP Same as above, but initializes the - memory used for atomics on the device. + policy memory used for atomics on the device. This works on all architectures but incurs higher overheads. -cuda/hip_reduce\*device_fence any CUDA/HIP Same as above, and reduction uses normal +cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal policy memory accesses that are not visible across the whole device and device scope fences to ensure visibility and ordering. This works on all architectures but incurs higher overheads on some architectures. -cuda/hip_reduce\*block_fence any CUDA/HIP Same as above, and reduction uses special +cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special policy memory accesses to a level of cache shared visible to the whole device and block scope fences to ensure ordering. This improves From ed9bb20700c8c37af296c88f2e6e45b723472901 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 14:58:55 -0700 Subject: [PATCH 097/108] Fix SYCL policy table --- docs/sphinx/user_guide/feature/policies.rst | 177 ++++++++++---------- 1 file changed, 88 insertions(+), 89 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 7660ab2a90..e68e990d50 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -577,95 +577,94 @@ GPU Policies for SYCL 2 always exists and should be used as one would use the x dimension for CUDA and HIP. - ======================================== ============= ============================== - SYCL Execution Policies Works with Brief description - ======================================== ============= ============================== - sycl_exec forall, Execute loop iterations - in a GPU kernel launched - with given work group - size. - sycl_launch_t launch Launches a sycl kernel, - any code express within - the lambda is executed - on the device. 
- sycl_global_0 kernel (For) Map loop iterates - directly to GPU global - ids in first - dimension, one iterate - per work item. Group - execution into work - groups of given size. - sycl_global_1 kernel (For) Same as above, but map - to global ids in second - dim - sycl_global_2 kernel (For) Same as above, but map - to global ids in third - dim - sycl_global_item_0 launch (loop) Creates a unique thread - id for each thread for - dimension 0 of the grid. - Same as computing - itm.get_group(0) * - itm.get_local_range(0) + - itm.get_local_id(0). - sycl_global_item_1 launch (loop) Same as above, but uses - threads in dimension 1 - Same as computing - itm.get_group(1) + - itm.get_local_range(1) * - itm.get_local_id(1). - sycl_global_item_2 launch (loop) Same as above, but uses - threads in dimension 2 - Same as computing - itm.get_group(2) + - itm.get_local_range(2) * - itm.get_local_id(2). - sycl_local_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU work - items in first - dimension, one iterate - per work item (see note - below about limitations) - sycl_local_1_direct kernel (For) Same as above, but map - launch (loop) to work items in second - dim - sycl_local_2_direct kernel (For) Same as above, but map - launch (loop) to work items in third - dim - sycl_local_0_loop kernel (For) Similar to - launch (loop) local-1-direct policy, - but use a work - group-stride loop which - doesn't limit number of - loop iterates - sycl_local_1_loop kernel (For) Same as above, but for - launch (loop) work items in second - dimension - sycl_local_2_loop kernel (For) Same as above, but for - launch (loop) work items in third - dimension - sycl_group_0_direct kernel (For) Map loop iterates - launch (loop) directly to GPU group - ids in first dimension, - one iterate per group - sycl_group_1_direct kernel (For) Same as above, but map - launch (loop) to groups in second - dimension - sycl_group_2_direct kernel (For) Same as above, but map - launch (loop) to groups in third - dimension - sycl_group_0_loop kernel (For) Similar to - launch (loop) group-1-direct policy, - but use a group-stride - loop. - sycl_group_1_loop kernel (For) Same as above, but use - launch (loop) groups in second - dimension - sycl_group_2_loop kernel (For) Same as above, but use - launch (loop) groups in third - dimension - - ======================================== ============= ============================== +======================================== ============= ============================== +SYCL Execution Policies Works with Brief description +======================================== ============= ============================== +sycl_exec forall, Execute loop iterations + in a GPU kernel launched + with given work group + size. +sycl_launch_t launch Launches a sycl kernel, + any code express within + the lambda is executed + on the device. +sycl_global_0 kernel (For) Map loop iterates + directly to GPU global + ids in first + dimension, one iterate + per work item. Group + execution into work + groups of given size. +sycl_global_1 kernel (For) Same as above, but map + to global ids in second + dim +sycl_global_2 kernel (For) Same as above, but map + to global ids in third + dim +sycl_global_item_0 launch (loop) Creates a unique thread + id for each thread for + dimension 0 of the grid. + Same as computing + itm.get_group(0) * + itm.get_local_range(0) + + itm.get_local_id(0). 
+sycl_global_item_1 launch (loop) Same as above, but uses + threads in dimension 1 + Same as computing + itm.get_group(1) + + itm.get_local_range(1) * + itm.get_local_id(1). +sycl_global_item_2 launch (loop) Same as above, but uses + threads in dimension 2 + Same as computing + itm.get_group(2) + + itm.get_local_range(2) * + itm.get_local_id(2). +sycl_local_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU work + items in first + dimension, one iterate + per work item (see note + below about limitations) +sycl_local_1_direct kernel (For) Same as above, but map + launch (loop) to work items in second + dim +sycl_local_2_direct kernel (For) Same as above, but map + launch (loop) to work items in third + dim +sycl_local_0_loop kernel (For) Similar to + launch (loop) local-1-direct policy, + but use a work + group-stride loop which + doesn't limit number of + loop iterates +sycl_local_1_loop kernel (For) Same as above, but for + launch (loop) work items in second + dimension +sycl_local_2_loop kernel (For) Same as above, but for + launch (loop) work items in third + dimension +sycl_group_0_direct kernel (For) Map loop iterates + launch (loop) directly to GPU group + ids in first dimension, + one iterate per group +sycl_group_1_direct kernel (For) Same as above, but map + launch (loop) to groups in second + dimension +sycl_group_2_direct kernel (For) Same as above, but map + launch (loop) to groups in third + dimension +sycl_group_0_loop kernel (For) Similar to + launch (loop) group-1-direct policy, + but use a group-stride + loop. +sycl_group_1_loop kernel (For) Same as above, but use + launch (loop) groups in second + dimension +sycl_group_2_loop kernel (For) Same as above, but use + launch (loop) groups in third + dimension +======================================== ============= ============================== OpenMP Target Offload Policies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 960c251c007bfae35f0fa76930d567411253b2d1 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 15:01:49 -0700 Subject: [PATCH 098/108] expand reduce policy table --- docs/sphinx/user_guide/feature/policies.rst | 98 +++++++++++---------- 1 file changed, 53 insertions(+), 45 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index e68e990d50..181d1a5754 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -242,7 +242,7 @@ policies have the prefix ``hip_``. | cuda/hip_exec | forall, | Execute loop iterations | | | scan, | directly mapped to global | | | sort | threads in a GPU kernel | -| | | launched with given threadblock | +| | | launched with given threadblock | | | | size and unbounded grid size. | | | | Note that the threadblock | | | | size must be provided. | @@ -297,7 +297,7 @@ policies have the prefix ``hip_``. 
| cuda/hip_exec_occ_fraction> | | | +| denominator>> | | | | | | | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | @@ -775,49 +775,57 @@ It is important to note the following constraints about RAJA reduction usage: The following table summarizes RAJA reduction policy types: -======================================== ============= ========================================== -Reduction Policy Loop Policies Brief description - to Use With -======================================== ============= ========================================== -seq_reduce seq_exec, Non-parallel (sequential) reduction. -omp_reduce any OpenMP OpenMP parallel reduction. - policy -omp_reduce_ordered any OpenMP OpenMP parallel reduction with result - policy guaranteed to be reproducible. -omp_target_reduce any OpenMP OpenMP parallel target offload reduction. - target policy -cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel - policy (device synchronization will occur when - reduction value is finalized). -cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use - policy atomic operations leading to run to run - variability in the results. -cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and - policy cuda/hip_reduce_atomic policies based on - the with_atomic boolean. -cuda/hip_reduce\*host_init\* any CUDA/HIP Same as above, but initializes the - policy memory used for atomics on the host. - This works on recent architectures and - incurs lower overheads. -cuda/hip_reduce\*device_init\* any CUDA/HIP Same as above, but initializes the - policy memory used for atomics on the device. - This works on all architectures but - incurs higher overheads. -cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal - policy memory accesses that are not visible across - the whole device and device scope fences - to ensure visibility and ordering. - This works on all architectures but - incurs higher overheads on some architectures. -cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special - policy memory accesses to a level of cache shared - visible to the whole device and block scope - fences to ensure ordering. This improves - performance on some architectures. -sycl_reduce any SYCL Reduction in a SYCL kernel (device - policy synchronization will occur when the - reduction value is finalized). -======================================== ============= ========================================== +================================================= ============= ========================================== +Reduction Policy Loop Policies Brief description + to Use With +================================================= ============= ========================================== +seq_reduce seq_exec, Non-parallel (sequential) reduction. +omp_reduce any OpenMP OpenMP parallel reduction. + policy +omp_reduce_ordered any OpenMP OpenMP parallel reduction with result + policy guaranteed to be reproducible. +omp_target_reduce any OpenMP OpenMP parallel target offload reduction. + target policy +cuda/hip_reduce any CUDA/HIP Parallel reduction in a CUDA/HIP kernel + policy (device synchronization will occur when + reduction value is finalized). +cuda/hip_reduce_atomic any CUDA/HIP Same as above, but reduction may use + policy atomic operations leading to run to run + variability in the results. 
+cuda/hip_reduce_base any CUDA/HIP Choose between cuda/hip_reduce and + policy cuda/hip_reduce_atomic policies based on + the with_atomic boolean. +cuda/hip_reduce_device_fence any CUDA/HIP Same as above, and reduction uses normal + policy memory accesses that are not visible across + the whole device and device scope fences + to ensure visibility and ordering. + This works on all architectures but + incurs higher overheads on some architectures. +cuda/hip_reduce_block_fence any CUDA/HIP Same as above, and reduction uses special + policy memory accesses to a level of cache + visible to the whole device and block scope + fences to ensure ordering. This improves + performance on some architectures. +cuda/hip_reduce_atomic_host_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_host_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the host. This works well on recent + architectures and incurs lower overheads. +cuda/hip_reduce_atomic_device_init_device_fence any CUDA/HIP Same as above with device fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +cuda/hip_reduce_atomic_device_init_block_fence any CUDA/HIP Same as above with block fence, but + policy initializes the memory used for atomics + on the device. This works on all architectures + but incurs higher overheads. +sycl_reduce any SYCL Reduction in a SYCL kernel (device + policy synchronization will occur when the + reduction value is finalized). +================================================= ============= ========================================== .. note:: RAJA reductions used with SIMD execution policies are not guaranteed to generate correct results. So they should not be used From 960c30378ddfd39257d434315a84c87195278978 Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 15:08:18 -0700 Subject: [PATCH 099/108] try something with multi-line tables --- docs/sphinx/user_guide/feature/policies.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index 181d1a5754..a7b3c1332a 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -489,8 +489,8 @@ policies: | | occupancy of the device. 
| +----------------------------------------------------+-----------------------------------------+ | Cuda/HipFractionOffsetOccupancyConcretizer< | Uses a fraction and offset to choose an | -| Fraction, | occupancy based on the max occupancy | -| BLOCKS_PER_SM_OFFSET> | Using the following formula: | +| Fraction, | occupancy based on the max occupancy | +| BLOCKS_PER_SM_OFFSET> | Using the following formula: | | | (Fraction * kernel_max_blocks_per_sm + | | | BLOCKS_PER_SM_OFFSET) * sm_per_device | +----------------------------------------------------+-----------------------------------------+ From 7f85f630393597b005aee91ad346cdbcb5ce683f Mon Sep 17 00:00:00 2001 From: Jason Burmark Date: Thu, 2 May 2024 15:19:41 -0700 Subject: [PATCH 100/108] fix more multi line table entries --- docs/sphinx/user_guide/feature/policies.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index a7b3c1332a..afd2d7638a 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -295,9 +295,9 @@ policies have the prefix ``hip_``. | | | performance reasons. | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_exec_occ_fraction> | | | +| Fraction> | | but use a fraction of the | +| | | maximum occupancy of the kernel.| +| | | | | | | | +----------------------------------------------------+---------------+---------------------------------+ | cuda/hip_exec_occ_custom | forall | Similar to the occ_max policy | From bc5e371a14610c70a4e329c04570076d02a325ef Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Thu, 2 May 2024 15:21:01 -0700 Subject: [PATCH 101/108] Fix SYCL exec policy table formatting and clarify note about SYCL reverse ordering from CUDA/HIP --- docs/sphinx/user_guide/feature/policies.rst | 43 +++++++++++++++------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/docs/sphinx/user_guide/feature/policies.rst b/docs/sphinx/user_guide/feature/policies.rst index e68e990d50..dd0b14ccd9 100644 --- a/docs/sphinx/user_guide/feature/policies.rst +++ b/docs/sphinx/user_guide/feature/policies.rst @@ -561,22 +561,41 @@ write more explicit policies. GPU Policies for SYCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. note:: SYCL uses C++-style ordering in which the right - most index corresponds to having unit stride. - In a three-dimensional compute grid this means - that dimension 2 has the unit stride while - dimension 0 has the longest stride. This is - important to note as the ordering is reverse - compared to the CUDA and HIP programming models. - CUDA and HIP employ a x/y/z ordering in which - dimension x has the unit stride. - - When using RAJA::launch, thread and team configuration +.. note:: SYCL uses C++-style ordering for its work group and global thread + dimension/indexing types. This is due, in part, to SYCL's closer + alignment with C++ multi-dimensional indexing, which is "row-major". + This is the reverse of the thread indexing used in CUDA or HIP, + which is "column-major". For example, suppose we have a thread-block + or work-group where we specify the shape as (nx, ny, nz). Consider + an element in the thread-block or work-group with id (x, y, z). + In CUDA or HIP, the element index is x + y * nx + z * nx * ny. In + SYCL, the element index is z + y * nz + x * nz * ny. 
+ + In terms of the CUDA or HIP built-in variables to support threads, + we have:: + + Thread ID: threadIdx.x/y/z + Block ID: blockIdx.x/y/z + Block dimension: blockDim.x/y/z + Grid dimension: gridDim.x/y/z + + The analogues in SYCL are:: + + Thread ID: sycl::nd_item.get_local_id(2/1/0) + Work-group ID: sycl::nd_item.get_group(2/1/0) + Work-group dimensions: sycl::nd_item.get_local_range().get(2/1/0) + ND-range dimensions: sycl::nd_item.get_group_range(2/1/0) + + When using ``RAJA::launch``, thread and block configuration follows CUDA and HIP programming models and is always - configured in three-dimensions. This means that dimension + configured in three-dimensions. This means that SYCL dimension 2 always exists and should be used as one would use the x dimension for CUDA and HIP. + Similarly, ``RAJA::kernel`` uses a three-dimensional work-group + configuration. SYCL imension 2 always exists and should be used as + one would use the x dimension in CUDA and HIP. + ======================================== ============= ============================== SYCL Execution Policies Works with Brief description ======================================== ============= ============================== From 89004eb0cdb58392aef2a5f4f32fb8f9d7813e12 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 11:30:46 -0700 Subject: [PATCH 102/108] Bumping poodle allocation time to prevent intel 19 timeout --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index cee458cd60..6593a4e357 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -25,7 +25,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=60 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle From 70d4ccedd595872cff1fd6d9fdbeb7fb72f3251c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 13:38:12 -0700 Subject: [PATCH 103/108] Remove remnants of loop_exec and associated policies. These were deprecated a few releases back, but were maintained at the request of some users. --- include/RAJA/RAJA.hpp | 7 --- include/RAJA/policy/loop.hpp | 35 ------------ include/RAJA/policy/loop/policy.hpp | 87 ----------------------------- 3 files changed, 129 deletions(-) delete mode 100644 include/RAJA/policy/loop.hpp delete mode 100644 include/RAJA/policy/loop/policy.hpp diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp index 5478392ff1..c37ac997a4 100644 --- a/include/RAJA/RAJA.hpp +++ b/include/RAJA/RAJA.hpp @@ -59,13 +59,6 @@ // #include "RAJA/policy/sequential.hpp" -// -// NOTE: LOOP POLCIES WERE DEPRECATED IN 2023.03.0 RELEASE. -// THEY ARE RE-ADDED HERE AT REQUEST OF USERS. -// THEY WILL BE REMOVED AGAIN IN THE FUTURE. -// -#include "RAJA/policy/loop.hpp" - // // All platforms should support simd and vector execution. // diff --git a/include/RAJA/policy/loop.hpp b/include/RAJA/policy/loop.hpp deleted file mode 100644 index 2cd9525dcd..0000000000 --- a/include/RAJA/policy/loop.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/*! -****************************************************************************** -* -* \file -* -* \brief Header file containing RAJA headers for sequential execution. -* -* These methods work on all platforms. 
-* -****************************************************************************** -*/ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef RAJA_loop_HPP -#define RAJA_loop_HPP - -#if !defined(RAJA_ENABLE_DESUL_ATOMICS) - #include "RAJA/policy/sequential/atomic.hpp" -#endif - -#include "RAJA/policy/sequential/forall.hpp" -#include "RAJA/policy/sequential/kernel.hpp" -#include "RAJA/policy/loop/policy.hpp" -#include "RAJA/policy/sequential/scan.hpp" -#include "RAJA/policy/sequential/sort.hpp" -#include "RAJA/policy/sequential/launch.hpp" -#include "RAJA/policy/sequential/WorkGroup.hpp" - -#endif // closing endif for header file include guard diff --git a/include/RAJA/policy/loop/policy.hpp b/include/RAJA/policy/loop/policy.hpp deleted file mode 100644 index 1bf34250bb..0000000000 --- a/include/RAJA/policy/loop/policy.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/*! - ****************************************************************************** - * - * \file - * - * \brief Header file containing RAJA sequential policy definitions. - * - ****************************************************************************** - */ - -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-24, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#ifndef policy_loop_HPP -#define policy_loop_HPP - -#include "RAJA/policy/PolicyBase.hpp" - -#include "RAJA/policy/sequential/policy.hpp" - -namespace RAJA -{ -namespace policy -{ -namespace loop -{ - -// -////////////////////////////////////////////////////////////////////// -// -// Execution policies -// -////////////////////////////////////////////////////////////////////// -// - -/// -/// Segment execution policies -/// - -using loop_exec = seq_exec; - -/// -/// Index set segment iteration policies -/// -using loop_segit = seq_exec; - -/// -/// WorkGroup execution policies -/// -using loop_work = seq_work; - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Reduction execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_reduce = seq_reduce; - - -/// -/////////////////////////////////////////////////////////////////////// -/// -/// Atomic execution policies -/// -/////////////////////////////////////////////////////////////////////// -/// -using loop_atomic = seq_atomic; - -} // end namespace loop - -} // end namespace policy - -using policy::loop::loop_atomic; -using policy::loop::loop_exec; -using policy::loop::loop_reduce; -using policy::loop::loop_segit; -using policy::loop::loop_work; - -} // namespace RAJA - -#endif From 8b6ce63a54b5ce399f000d0a0fadb6c2f98b561e Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 13:59:12 -0700 Subject: [PATCH 104/108] Update version number for release. 
--- CMakeLists.txt | 2 +- docs/conf.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e4823564b..9e5ecec0b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ include(CMakeDependentOption) # Set version number set(RAJA_VERSION_MAJOR 2024) set(RAJA_VERSION_MINOR 02) -set(RAJA_VERSION_PATCHLEVEL 1) +set(RAJA_VERSION_PATCHLEVEL 2) if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")) message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}") diff --git a/docs/conf.py b/docs/conf.py index 1570ed2888..3212170b30 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -88,7 +88,7 @@ # The short X.Y version. version = u'2024.02' # The full version, including alpha/beta/rc tags. -release = u'2024.02.1' +release = u'2024.02.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From 4e0049ac5e8c47322bdb9477e2effe1c7ba56b6c Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 14:00:19 -0700 Subject: [PATCH 105/108] Add release notes for v2024.02.2 release. --- RELEASE_NOTES.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 2e26861191..9efd9b277c 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -20,6 +20,39 @@ Notable changes include: * Bug fixes/improvements: +Version 2024.02.2 -- Release date 2024-05-08 +============================================ + +This release contains a bugfix and new execution policies that improve +performance for GPU kernels with reductions. + +Notable changes include: + + * New features / API changes: + * New CPU execution policies for CUDA and HIP added which provide + improved performance for GPU kernels with reductions. Please see the + RAJA User Guide for more information. Short summary: + * Option added to change max grid size in policies that use the + occupancy calculator. + * Policies added to run with max occupancy, a fraction of of the + max occupancy, and to run with a "concretizer" which allows a + user to determine how to run based on what the occupancy + calculator determines about a kernel. + * Additional options to tune kernels containing reductions, such as + * an option to initialize data on host for reductions that use + atomic operations + * an option to avoid device scope memory fences + * Change ordering of SYCL thread index ordering in RAJA::launch to + follow the SYCL "row-major" convention. Please see RAJA User Guide + for more information. + + * Build changes/improvements: + * NONE. + + * Bug fixes/improvements: + * Fixed issue in bump-style allocator used internally in RAJA::launch. + + Version 2024.02.1 -- Release date 2024-04-03 ============================================ From 89d87cd9ce0d5960a0dded5e9e37f558b5079585 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Fri, 3 May 2024 15:30:23 -0700 Subject: [PATCH 106/108] Fix typo. --- RELEASE_NOTES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 9efd9b277c..c2df2a03ea 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -29,7 +29,7 @@ performance for GPU kernels with reductions. 
Notable changes include: * New features / API changes: - * New CPU execution policies for CUDA and HIP added which provide + * New GPU execution policies for CUDA and HIP added which provide improved performance for GPU kernels with reductions. Please see the RAJA User Guide for more information. Short summary: * Option added to change max grid size in policies that use the From b745c98a9383a959cce92f1568995152e00a1a97 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 May 2024 09:15:00 -0700 Subject: [PATCH 107/108] Update custom-jobs-and-variables.yml Bump poodle allocation time to avoid timeouts --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 6593a4e357..62d7908945 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -25,7 +25,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=120 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle From 919aafde3fcee684ef06768c00fafa1de788e607 Mon Sep 17 00:00:00 2001 From: Rich Hornung Date: Mon, 6 May 2024 09:40:46 -0700 Subject: [PATCH 108/108] Update custom-jobs-and-variables.yml Bump poodle allocation time to prevent timeouts --- .gitlab/custom-jobs-and-variables.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab/custom-jobs-and-variables.yml b/.gitlab/custom-jobs-and-variables.yml index 6593a4e357..62d7908945 100644 --- a/.gitlab/custom-jobs-and-variables.yml +++ b/.gitlab/custom-jobs-and-variables.yml @@ -25,7 +25,7 @@ variables: # Poodle # Arguments for top level allocation - POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1" + POODLE_SHARED_ALLOC: "--exclusive --time=120 --nodes=1" # Arguments for job level allocation POODLE_JOB_ALLOC: "--nodes=1" # Project specific variants for poodle
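The reduction-tuning policies described in the release notes above and in the
expanded reduction policy table earlier in this series are selected the same way
as any other reduction policy. A minimal sketch, assuming the policy spelling
shown in that table and illustrative values for the device pointer ``d_x``, the
length ``N``, and the block size of 256::

   #include "RAJA/RAJA.hpp"

   double device_sum(const double* d_x, int N)
   {
     // Atomic-based reduction that initializes its atomic storage on the host
     // and uses device-scope fences; the release notes describe this
     // combination as working well on recent GPU architectures.
     using reduce_pol = RAJA::cuda_reduce_atomic_host_init_device_fence;

     RAJA::ReduceSum<reduce_pol, double> sum(0.0);
     RAJA::forall<RAJA::cuda_exec_with_reduce<256>>(
       RAJA::RangeSegment(0, N),
       [=] RAJA_DEVICE (int i) { sum += d_x[i]; });

     return sum.get();
   }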