Skip to content

Commit e11b358

Browse files
[SYCL][Reduction] Avoid implicit atomic64 requirements (#9070)
Some of the existing reduction strategies use atomic operations on partial reduction results. However, for reductions on 64-bit values this implicitly adds a requirement that the corresponding device supports aspect::atomic64. This commit adds additional logic to select different strategies based on this. Note; one of these naively chooses another strategy if the type is 64-bit without checking the support at runtime. Follow-up patches should refactor the strategy selection, allowing this to do appropriate runtime checks. --------- Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
1 parent 4447a50 commit e11b358

File tree

2 files changed

+88
-1
lines changed

2 files changed

+88
-1
lines changed

sycl/include/sycl/reduction.hpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2632,6 +2632,20 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
26322632
else
26332633
return Delegate(Impl<Strat::basic>{});
26342634
} else if constexpr (Reduction::has_fast_atomics) {
2635+
if constexpr (sizeof(typename Reduction::result_type) == 8) {
2636+
// Both group_reduce_and_atomic_cross_wg and
2637+
// local_mem_tree_and_atomic_cross_wg implicitly require
2638+
// aspect::atomic64 if the result type of the reduction is 64-bit. If
2639+
// the device does not support this, we need to fall back to more
2640+
// reliable strategies.
2641+
if (!getDeviceFromHandler(CGH).has(aspect::atomic64)) {
2642+
if constexpr (Reduction::has_fast_reduce)
2643+
return Delegate(Impl<Strat::group_reduce_and_multiple_kernels>{});
2644+
else
2645+
return Delegate(Impl<Strat::basic>{});
2646+
}
2647+
}
2648+
26352649
if constexpr (Reduction::has_fast_reduce) {
26362650
return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
26372651
} else {
@@ -2762,10 +2776,16 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
27622776
// specification. However, implementing run-time check for that would
27632777
// result in an extra kernel compilation(s). We probably need to
27642778
// investigate if the usage of kernel_bundles can mitigate that.
2779+
// TODO: local_atomic_and_atomic_cross_wg uses atomics on the partial
2780+
// results, which may add an implicit requirement on aspect::atomic64. As
2781+
// a temporary work-around we do not pick this if the result type is
2782+
// 64-bit. In the future this selection should be done at runtime based
2783+
// on the device.
27652784
// Note: Identityless reductions cannot use group reductions.
27662785
if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
27672786
return reduction::strategy::group_reduce_and_last_wg_detection;
2768-
else if constexpr (Reduction::has_fast_atomics)
2787+
else if constexpr (Reduction::has_fast_atomics &&
2788+
sizeof(typename Reduction::result_type) != 8)
27692789
return reduction::strategy::local_atomic_and_atomic_cross_wg;
27702790
else
27712791
return reduction::strategy::range_basic;
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
//
3+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
6+
7+
// Tests that a previously known case for reduction doesn't cause a requirement
8+
// for atomic64.
9+
// TODO: When aspect requirements are added to testing, this test could be set
10+
// to require that atomic64 is NOT supported, to limit how frequently the
11+
// test is run. However, it should work on devices that support atomic64
12+
// as well.
13+
14+
#include <sycl/sycl.hpp>
15+
16+
#include <iostream>
17+
18+
using namespace sycl;
19+
20+
int main() {
21+
queue Q;
22+
23+
if (Q.get_device().has(aspect::atomic64)) {
24+
std::cout << "Device supports aspect::atomic64 so we do not need to run "
25+
"the test."
26+
<< std::endl;
27+
return 0;
28+
}
29+
30+
long long *Out = malloc_shared<long long>(1, Q);
31+
32+
// Case 1: nd_range reduction with 64-bit integer and either sycl::plus,
33+
// sycl::minimum or sycl::maximum. group_reduce_and_atomic_cross_wg strategy
34+
// would normally be picked, but if the device does not support atomic64 that
35+
// strategy is invalid.
36+
Q.submit([&](handler &CGH) {
37+
auto Redu = reduction(Out, 0ll, sycl::plus<long long>{});
38+
CGH.parallel_for(nd_range<1>{range<1>{32}, range<1>{32}}, Redu,
39+
[=](nd_item<1> It, auto &Sum) {
40+
Sum.combine(It.get_global_linear_id());
41+
});
42+
}).wait();
43+
44+
// Case 2: nd_range reduction with 64-bit integer and either sycl::bit_or,
45+
// sycl::bit_xor, sycl::bit_and. local_mem_tree_and_atomic_cross_wg strategy
46+
// would normally be picked, but if the device does not support atomic64 that
47+
// strategy is invalid.
48+
Q.submit([&](handler &CGH) {
49+
auto Redu = reduction(Out, 0ll, sycl::bit_and<long long>{});
50+
CGH.parallel_for(nd_range<1>{range<1>{32}, range<1>{32}}, Redu,
51+
[=](nd_item<1> It, auto &Sum) {
52+
Sum.combine(It.get_global_linear_id());
53+
});
54+
}).wait();
55+
56+
// Case 3: range reduction with 64-bit integer and either sycl::bit_or,
57+
// sycl::bit_xor, sycl::bit_and. local_atomic_and_atomic_cross_wg strategy
58+
// would normally be picked, but if the device does not support atomic64 that
59+
// strategy is invalid.
60+
Q.submit([&](handler &CGH) {
61+
auto Redu = reduction(Out, 0ll, sycl::bit_and<long long>{});
62+
CGH.parallel_for(range<1>{32}, Redu,
63+
[=](item<1> It, auto &Sum) { Sum.combine(It); });
64+
}).wait();
65+
sycl::free(Out, Q);
66+
return 0;
67+
}

0 commit comments

Comments
 (0)