Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Fix types and transparent functors recognition in reduction #1709

Merged
merged 3 commits into from
May 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ class __SYCL_EXPORT handler {
int Dims, typename Reduction>
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
if (Reduction::is_usm)
Redu.associateWithHandler(*this);
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
Expand Down Expand Up @@ -886,7 +886,7 @@ class __SYCL_EXPORT handler {
int Dims, typename Reduction>
detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
auto RWAcc = Redu.getReadWriteScalarAcc(*this);
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
Expand Down Expand Up @@ -920,7 +920,7 @@ class __SYCL_EXPORT handler {
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
size_t NWorkGroups = Range.get_group_range().size();

// This parallel_for() is lowered to the following sequence:
Expand Down
227 changes: 147 additions & 80 deletions sycl/include/CL/sycl/intel/reduction.hpp

Large diffs are not rendered by default.

45 changes: 5 additions & 40 deletions sycl/test/reduction/reduction_ctor.cpp
Original file line number Diff line number Diff line change
@@ -1,48 +1,15 @@
// RUN: %clangxx -fsycl %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// This performs basic checks such as reduction creation, getIdentity() method,
// and the combine() method of the aux class 'reducer'.

#include "reduction_utils.hpp"
#include <CL/sycl.hpp>
#include <cassert>

using namespace cl::sycl;

template <typename T, class BinaryOperation, int N>
struct init_data_t {
void initInputData(T IdentityVal,
buffer<T, 1> &InBuf,
T &ExpectedReduValue) {
ExpectedReduValue = IdentityVal;
BinaryOperation Op;
auto In = InBuf.template get_access<access::mode::write>();
for (int I = 0; I < N; ++I) {
In[I] = ((I + 1) % 5) + 1;
ExpectedReduValue = Op(ExpectedReduValue, In[I]);
}
}
};

template <typename T, int N>
struct init_data_t<T, std::multiplies<T>, N> {
void initInputData(T IdentityVal, buffer<T, 1> &InBuf, T &ExpectedReduValue) {
ExpectedReduValue = IdentityVal;
std::multiplies<T> Op;
auto In = InBuf.template get_access<access::mode::write>();
for (int I = 0; I < N; ++I) {
In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
ExpectedReduValue = Op(ExpectedReduValue, In[I]);
}
}
};

template <typename T, typename Reduction>
void test_reducer(Reduction &Redu, T A, T B) {
Expand Down Expand Up @@ -157,15 +124,13 @@ int main() {
testBoth<int, intel::bit_or<int>>(0, 1, 8);
testBoth<int, intel::bit_xor<int>>(0, 7, 3);
testBoth<int, intel::bit_and<int>>(~0, 7, 3);
testBoth<int, intel::minimum<int>>(std::numeric_limits<int>::max(), 7, 3);
testBoth<int, intel::maximum<int>>(std::numeric_limits<int>::min(), 7, 3);
testBoth<int, intel::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
testBoth<int, intel::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);

testBoth<float, intel::plus<float>>(0, 1, 7);
testBoth<float, std::multiplies<float>>(1, 1, 7);
testBoth<float, intel::minimum<float>>(
std::numeric_limits<float>::max(), 7, 3);
testBoth<float, intel::maximum<float>>(
std::numeric_limits<float>::min(), 7, 3);
testBoth<float, intel::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
testBoth<float, intel::maximum<float>>(getMinimumFPValue<float>(), 7, 3);

testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
Expand Down
12 changes: 6 additions & 6 deletions sycl/test/reduction/reduction_nd_s0_dw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,17 @@ int main() {
test<int, 0, intel::bit_or<int>>(0, 8, 256);
test<int, 0, intel::bit_xor<int>>(0, 8, 256);
test<int, 0, intel::bit_and<int>>(~0, 8, 256);
test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);

// Check with various types.
test<float, 0, std::multiplies<float>>(1, 8, 256);
test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);

test<double, 0, std::multiplies<double>>(1, 8, 256);
test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);

// Check with CUSTOM type.
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
Expand Down
12 changes: 6 additions & 6 deletions sycl/test/reduction/reduction_nd_s0_rw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,17 +81,17 @@ int main() {
test<int, 0, intel::bit_or<int>>(0, 8, 256);
test<int, 0, intel::bit_xor<int>>(0, 8, 256);
test<int, 0, intel::bit_and<int>>(~0, 8, 256);
test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);

// Check with various types.
test<float, 0, std::multiplies<float>>(1, 8, 256);
test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);

test<double, 0, std::multiplies<double>>(1, 8, 256);
test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);

// Check with CUSTOM type.
test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
Expand Down
12 changes: 6 additions & 6 deletions sycl/test/reduction/reduction_nd_s1_dw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,17 +80,17 @@ int main() {
test<int, 1, intel::bit_or<int>>(0, 8, 256);
test<int, 1, intel::bit_xor<int>>(0, 8, 256);
test<int, 1, intel::bit_and<int>>(~0, 8, 256);
test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);

// Check with various types.
test<float, 1, std::multiplies<float>>(1, 8, 256);
test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);

test<double, 1, std::multiplies<double>>(1, 8, 256);
test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);

// Check with CUSTOM type.
test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
Expand Down
12 changes: 6 additions & 6 deletions sycl/test/reduction/reduction_nd_s1_rw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,17 @@ int main() {
test<int, 1, intel::bit_or<int>>(0, 8, 256);
test<int, 1, intel::bit_xor<int>>(0, 8, 256);
test<int, 1, intel::bit_and<int>>(~0, 8, 256);
test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);

// Check with various types.
test<float, 1, std::multiplies<float>>(1, 8, 256);
test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);

test<double, 1, std::multiplies<double>>(1, 8, 256);
test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);

// Check with CUSTOM type.
test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
Expand Down
4 changes: 2 additions & 2 deletions sycl/test/reduction/reduction_placeholder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ int main() {
test<int, 1, intel::bit_or<int>>(0, 4, 128);

// fast reduce
test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 5, 5 * 7);
test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 4, 128);
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);

// generic algorithm
test<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
Expand Down
82 changes: 72 additions & 10 deletions sycl/test/reduction/reduction_transparent.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
// UNSUPPORTED: cuda
// Reductions use work-group builtins not yet supported by CUDA.

// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// TODO: enable all checks for CPU/ACC when CPU/ACC RT supports intel::reduce()
// for 'cl::sycl::half' type.
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DSKIP_FOR_HALF -o %t.no_half.out
// RUN: %ACC_RUN_PLACEHOLDER %t.no_half.out
// RUN: %CPU_RUN_PLACEHOLDER %t.no_half.out

// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()

// This test performs basic checks of parallel_for(nd_range, reduction, func)
// where func is a transparent functor.
Expand All @@ -14,10 +23,14 @@
using namespace cl::sycl;

template <typename T, int Dim, class BinaryOperation>
class SomeClass;
class SomeIdClass;
template <typename T, int Dim, class BinaryOperation>
class SomeNoIdClass;

// Checks reductions initialized with transparent functor and explicitly set
// identity value.
template <typename T, int Dim, class BinaryOperation>
void test(T Identity, size_t WGSize, size_t NWItems) {
void testId(T Identity, size_t WGSize, size_t NWItems) {
buffer<T, 1> InBuf(NWItems);
buffer<T, 1> OutBuf(1);

Expand All @@ -32,13 +45,12 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
auto In = InBuf.template get_access<access::mode::read>(CGH);
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
Out(OutBuf, CGH);
auto Redu = intel::reduction(Out, Identity, BOp);

range<1> GlobalRange(NWItems);
range<1> LocalRange(WGSize);
nd_range<1> NDRange(GlobalRange, LocalRange);
CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
CGH.parallel_for<SomeIdClass<T, Dim, BinaryOperation>>(
NDRange, intel::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) {
Sum.combine(In[NDIt.get_global_linear_id()]);
});
});
Expand All @@ -54,12 +66,62 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
}
}

// Checks reductions initialized with transparent functor and identity
// value not explicitly specified. The parameter 'Identity' is passed here
// only to pre-initialize input data correctly.
template <typename T, int Dim, class BinaryOperation>
void testNoId(T Identity, size_t WGSize, size_t NWItems) {
buffer<T, 1> InBuf(NWItems);
buffer<T, 1> OutBuf(1);

// Initialize.
BinaryOperation BOp;
T CorrectOut;
initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);

// Compute.
queue Q;
Q.submit([&](handler &CGH) {
auto In = InBuf.template get_access<access::mode::read>(CGH);
accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
Out(OutBuf, CGH);

range<1> GlobalRange(NWItems);
range<1> LocalRange(WGSize);
nd_range<1> NDRange(GlobalRange, LocalRange);
CGH.parallel_for<SomeNoIdClass<T, Dim, BinaryOperation>>(
NDRange, intel::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) {
Sum.combine(In[NDIt.get_global_linear_id()]);
});
});

// Check correctness.
auto Out = OutBuf.template get_access<access::mode::read>();
T ComputedOut = *(Out.get_pointer());
if (ComputedOut != CorrectOut) {
std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
std::cout << "Computed value: " << ComputedOut
<< ", Expected value: " << CorrectOut << "\n";
assert(0 && "Wrong value.");
}
}

template <typename T, int Dim, class BinaryOperation>
void test(T Identity, size_t WGSize, size_t NWItems) {
testId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
testNoId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
}

int main() {
#if __cplusplus >= 201402L
test<double, 0, intel::maximum<>>(std::numeric_limits<double>::min(), 7, 7 * 5);
test<double, 0, intel::maximum<>>(getMinimumFPValue<double>(), 7, 7 * 5);
test<signed char, 0, intel::plus<>>(0, 7, 49);
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
#endif
#ifndef SKIP_FOR_HALF
test<half, 1, intel::plus<>>(0, 4, 8);
test<half, 1, intel::minimum<>>(getMaximumFPValue<half>(), 8, 32);
#endif // SKIP_FOR_HALF
#endif // __cplusplus >= 201402L

std::cout << "Test passed\n";
return 0;
Expand Down
6 changes: 2 additions & 4 deletions sycl/test/reduction/reduction_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,8 @@ int main() {
testUSM<int, 1, intel::bit_or<int>>(0, 4, 128);

// fast reduce
testUSM<float, 1, intel::minimum<float>>(
(std::numeric_limits<float>::max)(), 5, 5 * 7);
testUSM<float, 0, intel::maximum<float>>(
(std::numeric_limits<float>::min)(), 4, 128);
testUSM<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
testUSM<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);

// generic algorithm
testUSM<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
Expand Down
14 changes: 14 additions & 0 deletions sycl/test/reduction/reduction_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,17 @@ struct CustomVecPlus {
return CV(A.X + B.X, A.Y + B.Y);
}
};

template <typename T>
T getMinimumFPValue() {
return std::numeric_limits<T>::has_infinity
? -std::numeric_limits<T>::infinity()
: std::numeric_limits<T>::lowest();
}

template <typename T>
T getMaximumFPValue() {
return std::numeric_limits<T>::has_infinity
? std::numeric_limits<T>::infinity()
: (std::numeric_limits<T>::max)();
s-kanaev marked this conversation as resolved.
Show resolved Hide resolved
}