intel · steffenlarsen · Apr 27, 2022
@@ -0,0 +1,210 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This test checks that the parallel_for shortcuts on queue taking reduction
+// operations behave like their regular counterparts.
+
+#include <CL/sycl.hpp>
+
+constexpr size_t LocalSize = 32;
+constexpr size_t GlobalSize = 10 * LocalSize;
+
+// Tests for reduction shortcuts without dependency events.
+void compare_handler_and_shortcut_tests(sycl::queue &Q) {
+  sycl::range<1> GlobalRange{GlobalSize};
+  sycl::range<1> LocalRange{LocalSize};
+  sycl::nd_range<1> NDRange{GlobalRange, LocalRange};
+
+  int *InputData = sycl::malloc_shared<int>(GlobalSize, Q);
+  int *ShortcutOutputData = sycl::malloc_shared<int>(3, Q);
+  int *RegularOutputData = sycl::malloc_shared<int>(3, Q);
+
+  for (int I = 0; I < GlobalSize; ++I)
+    InputData[I] = I;
+
+  auto ReduF = [=](sycl::id<1> ID, auto &Sum) { Sum.combine(InputData[ID]); };
+
+  // Only parallel_for with range and and a single reduction operation is
+  // currently supported.
+  std::memset(ShortcutOutputData, 0, sizeof(int));
+  std::memset(RegularOutputData, 0, sizeof(int));
+  Q.parallel_for(GlobalRange,
+                 sycl::reduction(ShortcutOutputData, std::plus<int>()), ReduF);
+  Q.submit([&](sycl::handler &CGH) {
+    CGH.parallel_for(GlobalRange,
+                     sycl::reduction(RegularOutputData, std::plus<int>()),
+                     ReduF);
+  });
+  Q.wait();
+  assert(ShortcutOutputData[0] != 0);
+  assert(ShortcutOutputData[0] == RegularOutputData[0]);
+
+  auto NDReduF1 = [=](sycl::nd_item<1> NDIt, auto &Sum1) {
+    Sum1.combine(InputData[NDIt.get_global_id()]);
+  };
+
+  std::memset(ShortcutOutputData, 0, sizeof(int));
+  std::memset(RegularOutputData, 0, sizeof(int));
+  Q.parallel_for(NDRange, sycl::reduction(ShortcutOutputData, std::plus<int>()),
+                 NDReduF1);
+  Q.submit([&](sycl::handler &CGH) {
+    CGH.parallel_for(NDRange,
+                     sycl::reduction(RegularOutputData, std::plus<int>()),
+                     NDReduF1);
+  });
+  Q.wait();
+  assert(ShortcutOutputData[0] != 0);
+  assert(ShortcutOutputData[0] == RegularOutputData[0]);
+
+  auto NDReduF2 = [=](sycl::nd_item<1> NDIt, auto &Sum1, auto &Sum2) {
+    Sum1.combine(InputData[NDIt.get_global_id()]);
+    Sum2.combine(InputData[NDIt.get_global_id()] + 1);
+  };
+
+  std::memset(ShortcutOutputData, 0, sizeof(int) * 2);
+  std::memset(RegularOutputData, 0, sizeof(int) * 2);
+  Q.parallel_for(NDRange, sycl::reduction(ShortcutOutputData, std::plus<int>()),
+                 sycl::reduction(ShortcutOutputData + 1, std::plus<int>()),
+                 NDReduF2);
+  Q.submit([&](sycl::handler &CGH) {
+    CGH.parallel_for(
+        NDRange, sycl::reduction(RegularOutputData, std::plus<int>()),
+        sycl::reduction(RegularOutputData + 1, std::plus<int>()), NDReduF2);
+  });
+  Q.wait();
+  assert(ShortcutOutputData[0] != 0);
+  assert(ShortcutOutputData[1] != 0);
+  assert(ShortcutOutputData[0] == RegularOutputData[0]);
+  assert(ShortcutOutputData[1] == RegularOutputData[1]);
+
+  auto NDReduF3 = [=](sycl::nd_item<1> NDIt, auto &Sum1, auto &Sum2,
+                      auto &Sum3) {
+    Sum1.combine(InputData[NDIt.get_global_id()]);
+    Sum2.combine(InputData[NDIt.get_global_id()] + 1);
+    Sum3.combine(InputData[NDIt.get_global_id()] * 2);
+  };
+
+  std::memset(ShortcutOutputData, 0, sizeof(int) * 3);
+  std::memset(RegularOutputData, 0, sizeof(int) * 3);
+  Q.parallel_for(NDRange, sycl::reduction(ShortcutOutputData, std::plus<int>()),
+                 sycl::reduction(ShortcutOutputData + 1, std::plus<int>()),
+                 sycl::reduction(ShortcutOutputData + 2, std::plus<int>()),
+                 NDReduF3);
+  Q.submit([&](sycl::handler &CGH) {
+    CGH.parallel_for(
+        NDRange, sycl::reduction(RegularOutputData, std::plus<int>()),
+        sycl::reduction(RegularOutputData + 1, std::plus<int>()),
+        sycl::reduction(RegularOutputData + 2, std::plus<int>()), NDReduF3);
+  });
+  Q.wait();
+  assert(ShortcutOutputData[0] != 0);
+  assert(ShortcutOutputData[1] != 0);
+  assert(ShortcutOutputData[2] != 0);
+  assert(ShortcutOutputData[0] == RegularOutputData[0]);
+  assert(ShortcutOutputData[1] == RegularOutputData[1]);
+  assert(ShortcutOutputData[2] == RegularOutputData[2]);
+
+  sycl::free(InputData, Q);
+  sycl::free(ShortcutOutputData, Q);
+  sycl::free(RegularOutputData, Q);
+}
+
+// Tests for reduction shortcuts taking a single dependency event.
+void dep_event_tests(sycl::queue &Q) {
+  sycl::range<1> GlobalRange{GlobalSize};
+  sycl::range<1> LocalRange{LocalSize};
+  sycl::nd_range<1> NDRange{GlobalRange, LocalRange};
+
+  int *InputData = sycl::malloc_device<int>(GlobalSize, Q);
+  int *ShortcutOutputData = sycl::malloc_shared<int>(1, Q);
+
+  int InputDataHost[GlobalSize];
+  int ExpectedOutput = 0;
+  for (int I = 0; I < GlobalSize; ++I) {
+    InputDataHost[I] = I;
+    ExpectedOutput += I;
+  }
+
+  Q.fill<int>(InputData, 0, GlobalSize).wait();
+  std::memset(ShortcutOutputData, 0, sizeof(int));
+  sycl::event CopyEvent = Q.copy<int>(InputDataHost, InputData, GlobalSize);
+  Q.parallel_for(
+      GlobalRange, CopyEvent,
+      sycl::reduction(ShortcutOutputData, std::plus<int>()),
+      [=](sycl::id<1> ID, auto &Sum) { Sum.combine(InputData[ID]); });
+  Q.wait();
+  assert(*ShortcutOutputData == ExpectedOutput);
+
+  Q.fill<int>(InputData, 0, GlobalSize).wait();
+  std::memset(ShortcutOutputData, 0, sizeof(int));
+  CopyEvent = Q.copy<int>(InputDataHost, InputData, GlobalSize);
+  Q.parallel_for(NDRange, CopyEvent,
+                 sycl::reduction(ShortcutOutputData, std::plus<int>()),
+                 [=](sycl::nd_item<1> NDIt, auto &Sum1) {
+                   Sum1.combine(InputData[NDIt.get_global_id()]);
+                 });
+  Q.wait();
+  assert(*ShortcutOutputData == ExpectedOutput);
+
+  sycl::free(InputData, Q);
+  sycl::free(ShortcutOutputData, Q);
+}
+
+// Tests for reduction shortcuts taking multiple dependency events.
+void dep_events_tests(sycl::queue &Q) {
+  sycl::range<1> GlobalRange{GlobalSize};
+  sycl::range<1> LocalRange{LocalSize};
+  sycl::nd_range<1> NDRange{GlobalRange, LocalRange};
+
+  int *InputData1 = sycl::malloc_device<int>(GlobalSize, Q);
+  int *InputData2 = sycl::malloc_device<int>(GlobalSize, Q);
+  int *ShortcutOutputData = sycl::malloc_shared<int>(1, Q);
+
+  int InputDataHost[GlobalSize];
+  int ExpectedOutput = 0;
+  for (int I = 0; I < GlobalSize; ++I) {
+    InputDataHost[I] = I;
+    ExpectedOutput += I;
+  }
+  ExpectedOutput *= 2;
+
+  Q.fill<int>(InputData1, 0, GlobalSize);
+  Q.fill<int>(InputData2, 0, GlobalSize);
+  Q.wait();
+  std::memset(ShortcutOutputData, 0, sizeof(int));
+  sycl::event CopyEvent1 = Q.copy<int>(InputDataHost, InputData1, GlobalSize);
+  sycl::event CopyEvent2 = Q.copy<int>(InputDataHost, InputData2, GlobalSize);
+  Q.parallel_for(GlobalRange, std::vector{CopyEvent1, CopyEvent2},
+                 sycl::reduction(ShortcutOutputData, std::plus<int>()),
+                 [=](sycl::id<1> ID, auto &Sum) {
+                   Sum.combine(InputData1[ID] + InputData2[ID]);
+                 });
+  Q.wait();
+  assert(*ShortcutOutputData == ExpectedOutput);
+
+  Q.fill<int>(InputData1, 0, GlobalSize).wait();
+  std::memset(ShortcutOutputData, 0, sizeof(int));
+  CopyEvent1 = Q.copy<int>(InputDataHost, InputData1, GlobalSize);
+  CopyEvent2 = Q.copy<int>(InputDataHost, InputData2, GlobalSize);
+  Q.parallel_for(NDRange, std::vector{CopyEvent1, CopyEvent2},
+                 sycl::reduction(ShortcutOutputData, std::plus<int>()),
+                 [=](sycl::nd_item<1> NDIt, auto &Sum1) {
+                   Sum1.combine(InputData1[NDIt.get_global_id()] +
+                                InputData2[NDIt.get_global_id()]);
+                 });
+  Q.wait();
+  assert(*ShortcutOutputData == ExpectedOutput);
+
+  sycl::free(InputData1, Q);
+  sycl::free(InputData2, Q);
+  sycl::free(ShortcutOutputData, Q);
+}
+
+int main() {
+  sycl::queue Q;
+  compare_handler_and_shortcut_tests(Q);
+  dep_event_tests(Q);
+  dep_events_tests(Q);
+  return 0;
+}