[SYCL][Reduction] Avoid implicit atomic64 requirements (#9070)

steffenlarsen · web-flow · commit e11b35835722 · 2023-04-17T11:44:30.000+01:00
Some of the existing reduction strategies use atomic operations on
partial reduction results. However, for reductions on 64-bit values this
implicitly adds a requirement that the corresponding device supports
aspect::atomic64. This commit adds additional logic to select different
strategies based on this.
Note: one of these selections naively chooses another strategy whenever
the result type is 64-bit, without checking device support at runtime.
Follow-up patches should refactor the strategy selection so that it can
perform the appropriate runtime checks.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp
@@ -2632,6 +2632,20 @@ template <> struct NDRangeReduction<reduction::strategy::auto_select> {
       else
         return Delegate(Impl<Strat::basic>{});
     } else if constexpr (Reduction::has_fast_atomics) {
+      if constexpr (sizeof(typename Reduction::result_type) == 8) {
+        // Both group_reduce_and_atomic_cross_wg and
+        // local_mem_tree_and_atomic_cross_wg implicitly require
+        // aspect::atomic64 if the result type of the reduction is 64-bit. If
+        // the device does not support this, we need to fall back to more
+        // reliable strategies.
+        if (!getDeviceFromHandler(CGH).has(aspect::atomic64)) {
+          if constexpr (Reduction::has_fast_reduce)
+            return Delegate(Impl<Strat::group_reduce_and_multiple_kernels>{});
+          else
+            return Delegate(Impl<Strat::basic>{});
+        }
+      }
+
       if constexpr (Reduction::has_fast_reduce) {
         return Delegate(Impl<Strat::group_reduce_and_atomic_cross_wg>{});
       } else {
@@ -2762,10 +2776,16 @@ void reduction_parallel_for(handler &CGH, range<Dims> Range,
       // specification. However, implementing run-time check for that would
       // result in an extra kernel compilation(s). We probably need to
       // investigate if the usage of kernel_bundles can mitigate that.
+      // TODO: local_atomic_and_atomic_cross_wg uses atomics on the partial
+      // results, which may add an implicit requirement on aspect::atomic64. As
+      // a temporary work-around we do not pick this if the result type is
+      // 64-bit. In the future this selection should be done at runtime based
+      // on the device.
       // Note: Identityless reductions cannot use group reductions.
       if constexpr (Reduction::has_fast_reduce && Reduction::has_identity)
         return reduction::strategy::group_reduce_and_last_wg_detection;
-      else if constexpr (Reduction::has_fast_atomics)
+      else if constexpr (Reduction::has_fast_atomics &&
+                         sizeof(typename Reduction::result_type) != 8)
         return reduction::strategy::local_atomic_and_atomic_cross_wg;
       else
         return reduction::strategy::range_basic;
diff --git a/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp b/sycl/test-e2e/Regression/reduction_64bit_atomic64.cpp
@@ -0,0 +1,72 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+//
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// Tests that a previously known case for reduction doesn't cause a requirement
+// for atomic64.
+// TODO: When aspect requirements are added to testing, this test could be set
+//       to require that atomic64 is NOT supported, to limit how frequently the
+//       test is run. However, it should work on devices that support atomic64
+//       as well.
+
+#include <sycl/sycl.hpp>
+
+#include <iostream>
+
+using namespace sycl;
+
+int main() {
+  queue Q;
+
+  // Only devices without aspect::atomic64 can hit the problem under test.
+  if (Q.get_device().has(aspect::atomic64)) {
+    std::cout << "Device supports aspect::atomic64 so we do not need to run "
+                 "the test."
+              << std::endl;
+    return 0;
+  }
+
+  long long *Out = malloc_shared<long long>(1, Q);
+  // Without property::reduction::initialize_to_identity the reductions below
+  // combine into the existing value of *Out, so give it a defined value rather
+  // than reading uninitialized USM memory.
+  *Out = 0;
+
+  // Case 1: nd_range reduction with 64-bit integer and either sycl::plus,
+  // sycl::minimum or sycl::maximum. group_reduce_and_atomic_cross_wg strategy
+  // would normally be picked, but if the device does not support atomic64 that
+  // strategy is invalid.
+  Q.submit([&](handler &CGH) {
+     auto Redu = reduction(Out, 0ll, sycl::plus<long long>{});
+     CGH.parallel_for(nd_range<1>{range<1>{32}, range<1>{32}}, Redu,
+                      [=](nd_item<1> It, auto &Sum) {
+                        Sum.combine(It.get_global_linear_id());
+                      });
+   }).wait();
+
+  // Case 2: nd_range reduction with 64-bit integer and either sycl::bit_or,
+  // sycl::bit_xor, sycl::bit_and. local_mem_tree_and_atomic_cross_wg strategy
+  // would normally be picked, but if the device does not support atomic64 that
+  // strategy is invalid.
+  Q.submit([&](handler &CGH) {
+     auto Redu = reduction(Out, 0ll, sycl::bit_and<long long>{});
+     CGH.parallel_for(nd_range<1>{range<1>{32}, range<1>{32}}, Redu,
+                      [=](nd_item<1> It, auto &Sum) {
+                        Sum.combine(It.get_global_linear_id());
+                      });
+   }).wait();
+
+  // Case 3: range reduction with 64-bit integer and either sycl::bit_or,
+  // sycl::bit_xor, sycl::bit_and. local_atomic_and_atomic_cross_wg strategy
+  // would normally be picked, but if the device does not support atomic64 that
+  // strategy is invalid.
+  Q.submit([&](handler &CGH) {
+     auto Redu = reduction(Out, 0ll, sycl::bit_and<long long>{});
+     CGH.parallel_for(range<1>{32}, Redu,
+                      [=](item<1> It, auto &Sum) { Sum.combine(It); });
+   }).wait();
+  sycl::free(Out, Q);
+  return 0;
+}