
Commit 593f756

Merge pull request #1638 from LLNL/v2024.02.2-RC
V2024.02.2 RC merge to main
2 parents 3ada095 + b33995f commit 593f756


74 files changed: +5284 -2033 lines

.gitlab-ci.yml (+4 -2)

@@ -75,7 +75,7 @@ stages:
 include:
   - local: '.gitlab/custom-jobs-and-variables.yml'
   - project: 'radiuss/radiuss-shared-ci'
-    ref: 'v2023.12.3'
+    ref: 'v2024.04.0'
     file: 'pipelines/${CI_MACHINE}.yml'
   - artifact: '${CI_MACHINE}-jobs.yml'
     job: 'generate-job-lists'

@@ -100,9 +100,11 @@ trigger-rajaperf:
   strategy: depend

 include:
+  - project: 'lc-templates/id_tokens'
+    file: 'id_tokens.yml'
   # [Optional] checks preliminary to running the actual CI test
   - project: 'radiuss/radiuss-shared-ci'
-    ref: 'v2023.12.3'
+    ref: 'v2024.04.0'
     file: 'utilities/preliminary-ignore-draft-pr.yml'
   # pipelines subscribed by the project
   - local: '.gitlab/subscribed-pipelines.yml'

.gitlab/custom-jobs-and-variables.yml (+8 -8)

@@ -19,17 +19,17 @@ variables:
   # Note: We repeat the reservation, necessary when jobs are manually re-triggered.
   RUBY_JOB_ALLOC: "--reservation=ci --nodes=1"
   # Project specific variants for ruby
-  PROJECT_RUBY_VARIANTS: "~shared +openmp +tests"
+  PROJECT_RUBY_VARIANTS: "~shared +openmp +vectorization +tests"
   # Project specific deps for ruby
   PROJECT_RUBY_DEPS: ""

   # Poodle
   # Arguments for top level allocation
-  POODLE_SHARED_ALLOC: "--exclusive --time=60 --nodes=1"
+  POODLE_SHARED_ALLOC: "--exclusive --time=120 --nodes=1"
   # Arguments for job level allocation
   POODLE_JOB_ALLOC: "--nodes=1"
   # Project specific variants for poodle
-  PROJECT_POODLE_VARIANTS: "~shared +openmp +tests"
+  PROJECT_POODLE_VARIANTS: "~shared +openmp +vectorization +tests"
   # Project specific deps for poodle
   PROJECT_POODLE_DEPS: ""

@@ -39,26 +39,26 @@ variables:
   # Arguments for job level allocation
   CORONA_JOB_ALLOC: "--nodes=1 --begin-time=+5s"
   # Project specific variants for corona
-  PROJECT_CORONA_VARIANTS: "~shared ~openmp +tests"
+  PROJECT_CORONA_VARIANTS: "~shared ~openmp +vectorization +tests"
   # Project specific deps for corona
   PROJECT_CORONA_DEPS: "^blt@develop "

   # Tioga
   # Arguments for top level allocation
-  TIOGA_SHARED_ALLOC: "--exclusive --time-limit=60m --nodes=1 -o per-resource.count=2"
+  TIOGA_SHARED_ALLOC: "--exclusive --queue=pci --time-limit=60m --nodes=1 -o per-resource.count=2"
   # Arguments for job level allocation
   TIOGA_JOB_ALLOC: "--nodes=1 --begin-time=+5s"
   # Project specific variants for corona
-  PROJECT_TIOGA_VARIANTS: "~shared ~openmp +tests"
+  PROJECT_TIOGA_VARIANTS: "~shared +openmp +vectorization +tests"
   # Project specific deps for corona
   PROJECT_TIOGA_DEPS: "^blt@develop "

   # Lassen and Butte use a different job scheduler (spectrum lsf) that does not
   # allow pre-allocation the same way slurm does.
   # Arguments for job level allocation
-  LASSEN_JOB_ALLOC: "1 -W 30 -q pci"
+  LASSEN_JOB_ALLOC: "1 -W 40 -q pci"
   # Project specific variants for lassen
-  PROJECT_LASSEN_VARIANTS: "~shared +openmp +tests cuda_arch=70"
+  PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70"
   # Project specific deps for lassen
   PROJECT_LASSEN_DEPS: "^blt@develop "
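A note on the variant strings above, offered as an editorial assumption based on Spack spec syntax and RADIUSS shared-CI conventions rather than anything stated in the diff: each PROJECT_<MACHINE>_VARIANTS value reads as a Spack spec fragment that the shared CI combines with the package name and the matching PROJECT_<MACHINE>_DEPS constraint, so a Corona job would build roughly the spec `raja ~shared ~openmp +vectorization +tests ^blt@develop`.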

CMakeLists.txt (+1 -1)

@@ -16,7 +16,7 @@ include(CMakeDependentOption)
 # Set version number
 set(RAJA_VERSION_MAJOR 2024)
 set(RAJA_VERSION_MINOR 02)
-set(RAJA_VERSION_PATCHLEVEL 1)
+set(RAJA_VERSION_PATCHLEVEL 2)

 if (RAJA_LOADED AND (NOT RAJA_LOADED STREQUAL "${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}"))
   message(FATAL_ERROR "You are mixing RAJA versions. Loaded is ${RAJA_LOADED}, expected ${RAJA_VERSION_MAJOR}.${RAJA_VERSION_MINOR}.${RAJA_VERSION_PATCHLEVEL}")

RELEASE_NOTES.md (+33)

@@ -20,6 +20,39 @@ Notable changes include:
 * Bug fixes/improvements:


+Version 2024.02.2 -- Release date 2024-05-08
+============================================
+
+This release contains a bugfix and new execution policies that improve
+performance for GPU kernels with reductions.
+
+Notable changes include:
+
+  * New features / API changes:
+    * New GPU execution policies for CUDA and HIP added which provide
+      improved performance for GPU kernels with reductions. Please see the
+      RAJA User Guide for more information. Short summary:
+      * Option added to change max grid size in policies that use the
+        occupancy calculator.
+      * Policies added to run with max occupancy, a fraction of the
+        max occupancy, and to run with a "concretizer" which allows a
+        user to determine how to run based on what the occupancy
+        calculator determines about a kernel.
+      * Additional options to tune kernels containing reductions, such as
+        * an option to initialize data on host for reductions that use
+          atomic operations
+        * an option to avoid device scope memory fences
+    * Changed the SYCL thread index ordering in RAJA::launch to follow
+      the SYCL "row-major" convention. Please see the RAJA User Guide
+      for more information.
+
+  * Build changes/improvements:
+    * NONE.
+
+  * Bug fixes/improvements:
+    * Fixed issue in bump-style allocator used internally in RAJA::launch.
+
 Version 2024.02.1 -- Release date 2024-04-03
 ============================================
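To make the new policies concrete, here is a minimal C++ sketch (not part of this commit) of a sum reduction that can be switched onto the reduction-oriented CUDA/HIP policies named above; it assumes a RAJA build with the matching back-end enabled and, for the GPU variants, data placed in device-accessible memory:

  #include "RAJA/RAJA.hpp"
  #include <cstdio>

  int main()
  {
    const int N = 1000;
    int* vec = new int[N];  // for the GPU policies, use device-accessible memory instead
    for (int i = 0; i < N; ++i) { vec[i] = 1; }

    using exec_policy   = RAJA::seq_exec;    // portable CPU baseline
    using reduce_policy = RAJA::seq_reduce;
    // Reduction-oriented GPU variants added in this release:
    // using exec_policy   = RAJA::cuda_exec_with_reduce<256>;
    // using reduce_policy = RAJA::cuda_reduce_atomic;
    // using exec_policy   = RAJA::hip_exec_with_reduce<256>;
    // using reduce_policy = RAJA::hip_reduce_atomic;

    RAJA::ReduceSum<reduce_policy, int> vsum(0);
    RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
                              [=](RAJA::Index_type i) { vsum += vec[i]; });

    std::printf("vsum = %d\n", vsum.get());  // expect 1000
    delete[] vec;
    return 0;
  }

The `_with_reduce` execution policies and the `cuda/hip_reduce_atomic` reduction policies used in the comments are the ones documented in the new cook book section added later in this commit.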

docs/Licenses/rocprim-license.txt (+21)

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

docs/conf.py (+1 -1)

@@ -88,7 +88,7 @@
 # The short X.Y version.
 version = u'2024.02'
 # The full version, including alpha/beta/rc tags.
-release = u'2024.02.1'
+release = u'2024.02.2'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

docs/sphinx/user_guide/cook_book.rst (+23)

@@ -0,0 +1,23 @@
+.. ##
+.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+.. ## and RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _cook-book-label:
+
+************************
+RAJA Cook Book
+************************
+
+The following sections show common use case patterns and the recommended
+RAJA features and policies to use with them. They are intended to provide
+users with complete usage examples beyond what can be found in other parts
+of the RAJA User Guide. In particular, the examples and discussion provide
+guidance on RAJA execution policy selection to improve performance of user
+application codes.
+
+.. toctree::
+   :maxdepth: 2
+
+   cook_book/reduction
docs/sphinx/user_guide/cook_book/reduction.rst (+110, new file; filename inferred from the toctree entry above)

@@ -0,0 +1,110 @@
+.. ##
+.. ## Copyright (c) 2016-24, Lawrence Livermore National Security, LLC
+.. ## and other RAJA project contributors. See the RAJA/LICENSE file
+.. ## for details.
+.. ##
+.. ## SPDX-License-Identifier: (BSD-3-Clause)
+.. ##
+
+.. _cook-book-reductions-label:
+
+=======================
+Cooking with Reductions
+=======================
+
+Please see the following section for an overview discussion of RAJA reductions:
+
+* :ref:`feat-reductions-label`.
+
+
+----------------------------
+Reductions with RAJA::forall
+----------------------------
+
+Here is the setup for a simple reduction example::
+
+  const int N = 1000;
+
+  int vec[N];
+
+  for (int i = 0; i < N; ++i) {
+
+    vec[i] = 1;
+
+  }
+
+Here a simple sum reduction is performed in a for loop::
+
+  int vsum = 0;
+
+  // Run a kernel using the reduction objects
+  for (int i = 0; i < N; ++i) {
+
+    vsum += vec[i];
+
+  }
+
+The results of these operations will yield the following values:
+
+* ``vsum == 1000``
+
+RAJA uses policy types to specify how things are implemented.
+
+The forall *execution policy* specifies how the loop is run by the
+``RAJA::forall`` method. The following discussion includes examples of
+several other RAJA execution policies that could be applied. For example,
+``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU, while
+``RAJA::cuda_exec_with_reduce<256>`` runs the loop as a CUDA GPU kernel with
+256 threads per block and other CUDA kernel launch parameters, like the
+number of blocks, optimized for performance with reducers::
+
+  using exec_policy = RAJA::seq_exec;
+  // using exec_policy = RAJA::omp_parallel_for_exec;
+  // using exec_policy = RAJA::omp_target_parallel_for_exec<256>;
+  // using exec_policy = RAJA::cuda_exec_with_reduce<256>;
+  // using exec_policy = RAJA::hip_exec_with_reduce<256>;
+  // using exec_policy = RAJA::sycl_exec<256>;
+
+The reduction policy specifies how the reduction is done and must match the
+execution policy. For example, ``RAJA::seq_reduce`` does a sequential
+reduction and can only be used with sequential execution policies. The
+``RAJA::cuda_reduce_atomic`` policy uses atomics, if possible with the given
+data type, and can only be used with CUDA execution policies; the same
+pattern applies to the other RAJA execution back-ends, such as HIP and
+OpenMP. Here are example RAJA reduction policies whose names indicate which
+execution policies they work with::
+
+  using reduce_policy = RAJA::seq_reduce;
+  // using reduce_policy = RAJA::omp_reduce;
+  // using reduce_policy = RAJA::omp_target_reduce;
+  // using reduce_policy = RAJA::cuda_reduce_atomic;
+  // using reduce_policy = RAJA::hip_reduce_atomic;
+  // using reduce_policy = RAJA::sycl_reduce;
+
+Here a simple sum reduction is performed using RAJA::
+
+  RAJA::ReduceSum<reduce_policy, int> vsum(0);
+
+  RAJA::forall<exec_policy>( RAJA::RangeSegment(0, N),
+    [=](RAJA::Index_type i) {
+
+    vsum += vec[i];
+
+  });
+
+The results of these operations will yield the following values:
+
+* ``vsum.get() == 1000``
+
+Another option for the execution policy when using the CUDA or HIP back-ends
+is the base policies, which have a boolean parameter to choose between the
+general-use ``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce``
+policy::
+
+  // static constexpr bool with_reduce = ...;
+  // using exec_policy = RAJA::cuda_exec_base<with_reduce, 256>;
+  // using exec_policy = RAJA::hip_exec_base<with_reduce, 256>;
+
+Another option for the reduction policy when using the CUDA or HIP back-ends
+is the base policies, which have a boolean parameter to choose between the
+atomic ``cuda/hip_reduce_atomic`` policy and the non-atomic
+``cuda/hip_reduce`` policy::
+
+  // static constexpr bool with_atomic = ...;
+  // using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;
+  // using reduce_policy = RAJA::hip_reduce_base<with_atomic>;
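To round out the cook book addition, here is a hedged sketch (not part of the commit) showing how the base policies above let a single boolean flip a kernel between tuned variants at compile time. It assumes a CUDA-enabled RAJA build, guarded here by RAJA's ``RAJA_ENABLE_CUDA`` config macro, with data made device-accessible via ``cudaMallocManaged``:

  #include "RAJA/RAJA.hpp"
  #include <cstdio>

  #if defined(RAJA_ENABLE_CUDA)
  // One boolean per choice selects the tuned variant of each base policy.
  static constexpr bool with_reduce = true;  // the kernel body contains a reducer
  static constexpr bool with_atomic = true;  // int sums can use atomic operations
  using exec_policy   = RAJA::cuda_exec_base<with_reduce, 256>;
  using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;
  #else
  using exec_policy   = RAJA::seq_exec;      // portable fallback
  using reduce_policy = RAJA::seq_reduce;
  #endif

  int main()
  {
    const int N = 1000;
  #if defined(RAJA_ENABLE_CUDA)
    int* vec = nullptr;
    cudaMallocManaged(&vec, N * sizeof(int));  // device-accessible (managed) memory
  #else
    int* vec = new int[N];
  #endif
    for (int i = 0; i < N; ++i) { vec[i] = 1; }

    RAJA::ReduceSum<reduce_policy, int> vsum(0);
    RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
                              [=] RAJA_HOST_DEVICE (RAJA::Index_type i) { vsum += vec[i]; });

    std::printf("vsum = %d\n", vsum.get());    // expect 1000
    return 0;
  }

The appeal of the base policies is that tuning choices become ordinary ``constexpr`` values, so an application can centralize them in one header instead of scattering distinct policy names through its kernels.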
