Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Add reduction queue shortcut tests #1010

Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 210 additions & 0 deletions SYCL/Reduction/reduction_queue_shortcuts.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// This test checks that the parallel_for shortcuts on queue taking reduction
// operations behave like their regular counterparts.

#include <CL/sycl.hpp>

constexpr size_t LocalSize = 32;
constexpr size_t GlobalSize = 10 * LocalSize;

// Tests for reduction shortcuts without dependency events.
void compare_handler_and_shortcut_tests(sycl::queue &Q) {
sycl::range<1> GlobalRange{GlobalSize};
sycl::range<1> LocalRange{LocalSize};
sycl::nd_range<1> NDRange{GlobalRange, LocalRange};

int *InputData = sycl::malloc_shared<int>(GlobalSize, Q);
int *ShortcutOutputData = sycl::malloc_shared<int>(3, Q);
int *RegularOutputData = sycl::malloc_shared<int>(3, Q);

for (int I = 0; I < GlobalSize; ++I)
InputData[I] = I;

auto ReduF = [=](sycl::id<1> ID, auto &Sum) { Sum.combine(InputData[ID]); };

// Only parallel_for with range and and a single reduction operation is
// currently supported.
std::memset(ShortcutOutputData, 0, sizeof(int));
std::memset(RegularOutputData, 0, sizeof(int));
Q.parallel_for(GlobalRange,
sycl::reduction(ShortcutOutputData, std::plus<int>()), ReduF);
Q.submit([&](sycl::handler &CGH) {
CGH.parallel_for(GlobalRange,
sycl::reduction(RegularOutputData, std::plus<int>()),
ReduF);
});
Q.wait();
assert(ShortcutOutputData[0] != 0);
assert(ShortcutOutputData[0] == RegularOutputData[0]);

auto NDReduF1 = [=](sycl::nd_item<1> NDIt, auto &Sum1) {
Sum1.combine(InputData[NDIt.get_global_id()]);
};

std::memset(ShortcutOutputData, 0, sizeof(int));
std::memset(RegularOutputData, 0, sizeof(int));
Q.parallel_for(NDRange, sycl::reduction(ShortcutOutputData, std::plus<int>()),
NDReduF1);
Q.submit([&](sycl::handler &CGH) {
CGH.parallel_for(NDRange,
sycl::reduction(RegularOutputData, std::plus<int>()),
NDReduF1);
});
Q.wait();
assert(ShortcutOutputData[0] != 0);
assert(ShortcutOutputData[0] == RegularOutputData[0]);

auto NDReduF2 = [=](sycl::nd_item<1> NDIt, auto &Sum1, auto &Sum2) {
Sum1.combine(InputData[NDIt.get_global_id()]);
Sum2.combine(InputData[NDIt.get_global_id()] + 1);
};

std::memset(ShortcutOutputData, 0, sizeof(int) * 2);
std::memset(RegularOutputData, 0, sizeof(int) * 2);
Q.parallel_for(NDRange, sycl::reduction(ShortcutOutputData, std::plus<int>()),
sycl::reduction(ShortcutOutputData + 1, std::plus<int>()),
NDReduF2);
Q.submit([&](sycl::handler &CGH) {
CGH.parallel_for(
NDRange, sycl::reduction(RegularOutputData, std::plus<int>()),
sycl::reduction(RegularOutputData + 1, std::plus<int>()), NDReduF2);
});
Q.wait();
assert(ShortcutOutputData[0] != 0);
assert(ShortcutOutputData[1] != 0);
assert(ShortcutOutputData[0] == RegularOutputData[0]);
assert(ShortcutOutputData[1] == RegularOutputData[1]);

auto NDReduF3 = [=](sycl::nd_item<1> NDIt, auto &Sum1, auto &Sum2,
auto &Sum3) {
Sum1.combine(InputData[NDIt.get_global_id()]);
Sum2.combine(InputData[NDIt.get_global_id()] + 1);
Sum3.combine(InputData[NDIt.get_global_id()] * 2);
};

std::memset(ShortcutOutputData, 0, sizeof(int) * 3);
std::memset(RegularOutputData, 0, sizeof(int) * 3);
Q.parallel_for(NDRange, sycl::reduction(ShortcutOutputData, std::plus<int>()),
sycl::reduction(ShortcutOutputData + 1, std::plus<int>()),
sycl::reduction(ShortcutOutputData + 2, std::plus<int>()),
NDReduF3);
Q.submit([&](sycl::handler &CGH) {
CGH.parallel_for(
NDRange, sycl::reduction(RegularOutputData, std::plus<int>()),
sycl::reduction(RegularOutputData + 1, std::plus<int>()),
sycl::reduction(RegularOutputData + 2, std::plus<int>()), NDReduF3);
});
Q.wait();
assert(ShortcutOutputData[0] != 0);
assert(ShortcutOutputData[1] != 0);
assert(ShortcutOutputData[2] != 0);
assert(ShortcutOutputData[0] == RegularOutputData[0]);
assert(ShortcutOutputData[1] == RegularOutputData[1]);
assert(ShortcutOutputData[2] == RegularOutputData[2]);

sycl::free(InputData, Q);
sycl::free(ShortcutOutputData, Q);
sycl::free(RegularOutputData, Q);
}

// Tests for reduction shortcuts taking a single dependency event.
void dep_event_tests(sycl::queue &Q) {
sycl::range<1> GlobalRange{GlobalSize};
sycl::range<1> LocalRange{LocalSize};
sycl::nd_range<1> NDRange{GlobalRange, LocalRange};

int *InputData = sycl::malloc_device<int>(GlobalSize, Q);
int *ShortcutOutputData = sycl::malloc_shared<int>(1, Q);

int InputDataHost[GlobalSize];
int ExpectedOutput = 0;
for (int I = 0; I < GlobalSize; ++I) {
InputDataHost[I] = I;
ExpectedOutput += I;
}

Q.fill<int>(InputData, 0, GlobalSize).wait();
std::memset(ShortcutOutputData, 0, sizeof(int));
sycl::event CopyEvent = Q.copy<int>(InputDataHost, InputData, GlobalSize);
Q.parallel_for(
GlobalRange, CopyEvent,
sycl::reduction(ShortcutOutputData, std::plus<int>()),
[=](sycl::id<1> ID, auto &Sum) { Sum.combine(InputData[ID]); });
Q.wait();
assert(*ShortcutOutputData == ExpectedOutput);

Q.fill<int>(InputData, 0, GlobalSize).wait();
std::memset(ShortcutOutputData, 0, sizeof(int));
CopyEvent = Q.copy<int>(InputDataHost, InputData, GlobalSize);
Q.parallel_for(NDRange, CopyEvent,
sycl::reduction(ShortcutOutputData, std::plus<int>()),
[=](sycl::nd_item<1> NDIt, auto &Sum1) {
Sum1.combine(InputData[NDIt.get_global_id()]);
});
Q.wait();
assert(*ShortcutOutputData == ExpectedOutput);

sycl::free(InputData, Q);
sycl::free(ShortcutOutputData, Q);
}

// Tests for reduction shortcuts taking multiple dependency events.
void dep_events_tests(sycl::queue &Q) {
sycl::range<1> GlobalRange{GlobalSize};
sycl::range<1> LocalRange{LocalSize};
sycl::nd_range<1> NDRange{GlobalRange, LocalRange};

int *InputData1 = sycl::malloc_device<int>(GlobalSize, Q);
int *InputData2 = sycl::malloc_device<int>(GlobalSize, Q);
int *ShortcutOutputData = sycl::malloc_shared<int>(1, Q);

int InputDataHost[GlobalSize];
int ExpectedOutput = 0;
for (int I = 0; I < GlobalSize; ++I) {
InputDataHost[I] = I;
ExpectedOutput += I;
}
ExpectedOutput *= 2;

Q.fill<int>(InputData1, 0, GlobalSize);
Q.fill<int>(InputData2, 0, GlobalSize);
Q.wait();
std::memset(ShortcutOutputData, 0, sizeof(int));
sycl::event CopyEvent1 = Q.copy<int>(InputDataHost, InputData1, GlobalSize);
sycl::event CopyEvent2 = Q.copy<int>(InputDataHost, InputData2, GlobalSize);
Q.parallel_for(GlobalRange, std::vector{CopyEvent1, CopyEvent2},
sycl::reduction(ShortcutOutputData, std::plus<int>()),
[=](sycl::id<1> ID, auto &Sum) {
Sum.combine(InputData1[ID] + InputData2[ID]);
});
Q.wait();
assert(*ShortcutOutputData == ExpectedOutput);

Q.fill<int>(InputData1, 0, GlobalSize).wait();
std::memset(ShortcutOutputData, 0, sizeof(int));
CopyEvent1 = Q.copy<int>(InputDataHost, InputData1, GlobalSize);
CopyEvent2 = Q.copy<int>(InputDataHost, InputData2, GlobalSize);
Q.parallel_for(NDRange, std::vector{CopyEvent1, CopyEvent2},
sycl::reduction(ShortcutOutputData, std::plus<int>()),
[=](sycl::nd_item<1> NDIt, auto &Sum1) {
Sum1.combine(InputData1[NDIt.get_global_id()] +
InputData2[NDIt.get_global_id()]);
});
Q.wait();
assert(*ShortcutOutputData == ExpectedOutput);

sycl::free(InputData1, Q);
sycl::free(InputData2, Q);
sycl::free(ShortcutOutputData, Q);
}

int main() {
sycl::queue Q;
compare_handler_and_shortcut_tests(Q);
dep_event_tests(Q);
dep_events_tests(Q);
return 0;
}