Commit 9f6d349

Merge pull request #1625 from LLNL/feature/burmark1/reduction_tunings

Add more cuda/hip reducer tunings

2 parents: c315ddd + 89004eb

26 files changed: +3636 -1339 lines

.gitlab/custom-jobs-and-variables.yml (+2 -2)

@@ -25,7 +25,7 @@ variables:
 
   # Poodle
   # Arguments for top level allocation
-  POODLE_SHARED_ALLOC: "--exclusive --time=60 --nodes=1"
+  POODLE_SHARED_ALLOC: "--exclusive --time=90 --nodes=1"
   # Arguments for job level allocation
   POODLE_JOB_ALLOC: "--nodes=1"
   # Project specific variants for poodle

@@ -56,7 +56,7 @@ variables:
   # Lassen and Butte use a different job scheduler (spectrum lsf) that does not
   # allow pre-allocation the same way slurm does.
   # Arguments for job level allocation
-  LASSEN_JOB_ALLOC: "1 -W 30 -q pci"
+  LASSEN_JOB_ALLOC: "1 -W 40 -q pci"
   # Project specific variants for lassen
   PROJECT_LASSEN_VARIANTS: "~shared +openmp +vectorization +tests cuda_arch=70"
   # Project specific deps for lassen
docs/Licenses/rocprim-license.txt (+21)

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
docs/sphinx/user_guide/cook_book/reduction.rst (+22 -5)

@@ -46,21 +46,21 @@ Here a simple sum reduction is performed in a for loop::
 
 The results of these operations will yield the following values:
 
-* vsum == 1000
+* ``vsum == 1000``
 
 RAJA uses policy types to specify how things are implemented.
 
 The forall *execution policy* specifies how the loop is run by the ``RAJA::forall`` method. The following discussion includes examples of several other RAJA execution policies that could be applied.
 For example ``RAJA::seq_exec`` runs a C-style for loop sequentially on a CPU. The
-``RAJA::cuda_exec_rec_for_reduce<256>`` runs the loop as a CUDA GPU kernel with
+``RAJA::cuda_exec_with_reduce<256>`` runs the loop as a CUDA GPU kernel with
 256 threads per block and other CUDA kernel launch parameters, like the
 number of blocks, optimized for performance with reducers.::
 
     using exec_policy = RAJA::seq_exec;
    // using exec_policy = RAJA::omp_parallel_for_exec;
    // using exec_policy = RAJA::omp_target_parallel_for_exec<256>;
-   // using exec_policy = RAJA::cuda_exec_rec_for_reduce<256>;
-   // using exec_policy = RAJA::hip_exec_rec_for_reduce<256>;
+   // using exec_policy = RAJA::cuda_exec_with_reduce<256>;
+   // using exec_policy = RAJA::hip_exec_with_reduce<256>;
    // using exec_policy = RAJA::sycl_exec<256>;
 
 The reduction policy specifies how the reduction is done and must match the

@@ -90,4 +90,21 @@ Here a simple sum reduction is performed using RAJA::
 
 The results of these operations will yield the following values:
 
-* vsum.get() == 1000
+* ``vsum.get() == 1000``
+
+
+Another option for the execution policy when using the cuda or hip backends is
+a base policy, which has a boolean parameter to choose between the general-use
+``cuda/hip_exec`` policy and the ``cuda/hip_exec_with_reduce`` policy.::
+
+   // static constexpr bool with_reduce = ...;
+   // using exec_policy = RAJA::cuda_exec_base<with_reduce, 256>;
+   // using exec_policy = RAJA::hip_exec_base<with_reduce, 256>;
+
+Another option for the reduction policy when using the cuda or hip backends is
+a base policy, which has a boolean parameter to choose between the atomic
+``cuda/hip_reduce_atomic`` policy and the non-atomic ``cuda/hip_reduce`` policy.::
+
+   // static constexpr bool with_atomic = ...;
+   // using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;
+   // using reduce_policy = RAJA::hip_reduce_base<with_atomic>;
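To see how the documented pieces fit together, here is a minimal, self-contained
sketch (ours, not part of this commit) that combines the execution and reduction
base policies from the doc hunk above in a sum reduction; it assumes a
CUDA-enabled RAJA build, and everything other than the policy spellings taken
from the documentation is illustrative::

    #include "RAJA/RAJA.hpp"

    int main()
    {
      constexpr int N = 1000;

      // Policy choices in the documented base-policy form; the booleans pick
      // the reducer-tuned kernel and the non-atomic reduction at compile time.
      static constexpr bool with_reduce = true;
      static constexpr bool with_atomic = false;
      using exec_policy   = RAJA::cuda_exec_base<with_reduce, 256>;
      using reduce_policy = RAJA::cuda_reduce_base<with_atomic>;

      RAJA::ReduceSum<reduce_policy, int> vsum(0);

      RAJA::forall<exec_policy>(RAJA::RangeSegment(0, N),
          [=] RAJA_DEVICE (int /*i*/) {
            vsum += 1;  // each of the N iterates contributes 1
          });

      // Matches the cook book result: vsum.get() == 1000
      return (vsum.get() == N) ? 0 : 1;
    }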

docs/sphinx/user_guide/feature/policies.rst (+422 -340)

(Large diff not rendered.)

include/RAJA/RAJA.hpp (+6)

@@ -33,6 +33,7 @@
 #include "RAJA/util/camp_aliases.hpp"
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
+#include "RAJA/util/math.hpp"
 #include "RAJA/util/plugins.hpp"
 #include "RAJA/util/Registry.hpp"
 #include "RAJA/util/for_each.hpp"

@@ -156,6 +157,11 @@
 //
 #include "RAJA/util/sort.hpp"
 
+//
+// reduce algorithms
+//
+#include "RAJA/util/reduce.hpp"
+
 //
 // WorkPool, WorkGroup, WorkSite objects
 //

include/RAJA/policy/cuda/MemUtils_CUDA.hpp (+28 -3)

@@ -61,7 +61,7 @@ struct PinnedAllocator {
     return ptr;
   }
 
-  // returns true on success, false on failure
+  // returns true on success, throws a run time error exception on failure
   bool free(void* ptr)
   {
     cudaErrchk(cudaFreeHost(ptr));

@@ -80,7 +80,7 @@ struct DeviceAllocator {
     return ptr;
   }
 
-  // returns true on success, false on failure
+  // returns true on success, throws a run time error exception on failure
   bool free(void* ptr)
   {
     cudaErrchk(cudaFree(ptr));

@@ -103,7 +103,31 @@ struct DeviceZeroedAllocator {
     return ptr;
   }
 
-  // returns true on success, false on failure
+  // returns true on success, throws a run time error exception on failure
+  bool free(void* ptr)
+  {
+    cudaErrchk(cudaFree(ptr));
+    return true;
+  }
+};
+
+//! Allocator for device pinned memory for use in basic_mempool
+struct DevicePinnedAllocator {
+
+  // returns a valid pointer on success, nullptr on failure
+  void* malloc(size_t nbytes)
+  {
+    int device;
+    cudaErrchk(cudaGetDevice(&device));
+    void* ptr;
+    cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal));
+    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
+    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
+
+    return ptr;
+  }
+
+  // returns true on success, throws a run time error exception on failure
   bool free(void* ptr)
   {
     cudaErrchk(cudaFree(ptr));

@@ -114,6 +138,7 @@ struct DeviceZeroedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
+using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
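A hedged usage sketch of the new ``device_pinned_mempool_type`` alias (our
example, not the commit's): it assumes RAJA's ``basic_mempool::MemPool``
singleton interface (``getInstance``, typed ``malloc``, ``free``) and that the
alias lives in the ``RAJA::cuda`` namespace alongside the existing mempool
aliases; the function name is hypothetical::

    #include <cstddef>
    #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"

    // Sketch only: exercise the DevicePinnedAllocator added above through
    // the new mempool alias.
    void scratch_example(std::size_t n)
    {
      auto& pool = RAJA::cuda::device_pinned_mempool_type::getInstance();

      // Managed memory preferring the current device but advised for CPU
      // access, per DevicePinnedAllocator::malloc above.
      double* buf = pool.malloc<double>(n);

      // ... launch kernels that write buf, then read the results on the host ...

      pool.free(buf);  // on failure, cudaErrchk throws rather than returning false
    }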
