Skip to content

[SYCL] Extend broadcast to TriviallyCopyable types #2160

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 31, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
190 changes: 142 additions & 48 deletions sycl/include/CL/sycl/detail/spirv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,32 @@ template <> struct group_scope<::cl::sycl::intel::sub_group> {
static constexpr __spv::Scope::Flag value = __spv::Scope::Flag::Subgroup;
};

// Generic shuffles and broadcasts may require multiple calls to SPIR-V
// intrinsics, and should use the fewest broadcasts possible
// - Loop over 64-bit chunks until remaining bytes < 64-bit
// - At most one 32-bit, 16-bit and 8-bit chunk left over
template <typename T, typename Functor>
void GenericCall(const Functor &ApplyToBytes) {
if (sizeof(T) >= sizeof(uint64_t)) {
#pragma unroll
for (size_t Offset = 0; Offset < sizeof(T); Offset += sizeof(uint64_t)) {
ApplyToBytes(Offset, sizeof(uint64_t));
}
}
if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
ApplyToBytes(Offset, sizeof(uint32_t));
}
if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
ApplyToBytes(Offset, sizeof(uint16_t));
}
if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
ApplyToBytes(Offset, sizeof(uint8_t));
}
}

template <typename Group> bool GroupAll(bool pred) {
return __spirv_GroupAll(group_scope<Group>::value, pred);
}
Expand All @@ -41,47 +67,137 @@ template <typename Group> bool GroupAny(bool pred) {
return __spirv_GroupAny(group_scope<Group>::value, pred);
}

// Native broadcasts map directly to a SPIR-V GroupBroadcast intrinsic
template <typename T>
using is_native_broadcast = bool_constant<detail::is_arithmetic<T>::value>;

template <typename T, typename IdT = size_t>
using EnableIfNativeBroadcast = detail::enable_if_t<
is_native_broadcast<T>::value && std::is_integral<IdT>::value, T>;

// Bitcast broadcasts can be implemented using a single SPIR-V GroupBroadcast
// intrinsic, but require type-punning via an appropriate integer type
template <typename T>
using is_bitcast_broadcast = bool_constant<
!is_native_broadcast<T>::value && std::is_trivially_copyable<T>::value &&
(sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8)>;

template <typename T, typename IdT = size_t>
using EnableIfBitcastBroadcast = detail::enable_if_t<
is_bitcast_broadcast<T>::value && std::is_integral<IdT>::value, T>;

template <typename T>
using ConvertToNativeBroadcastType_t = select_cl_scalar_integral_unsigned_t<T>;

// Generic broadcasts may require multiple calls to SPIR-V GroupBroadcast
// intrinsics, and should use the fewest broadcasts possible
// - Loop over 64-bit chunks until remaining bytes < 64-bit
// - At most one 32-bit, 16-bit and 8-bit chunk left over
template <typename T>
using is_generic_broadcast =
bool_constant<!is_native_broadcast<T>::value &&
!is_bitcast_broadcast<T>::value &&
std::is_trivially_copyable<T>::value>;

template <typename T, typename IdT = size_t>
using EnableIfGenericBroadcast = detail::enable_if_t<
is_generic_broadcast<T>::value && std::is_integral<IdT>::value, T>;

// Broadcast with scalar local index
// Work-group supports any integral type
// Sub-group currently supports only uint32_t
template <typename Group> struct GroupId { using type = size_t; };
template <> struct GroupId<::cl::sycl::intel::sub_group> {
using type = uint32_t;
};
template <typename Group, typename T, typename IdT>
detail::enable_if_t<is_group<Group>::value && std::is_integral<IdT>::value, T>
GroupBroadcast(T x, IdT local_id) {
EnableIfNativeBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
using GroupIdT = typename GroupId<Group>::type;
GroupIdT GroupLocalId = static_cast<GroupIdT>(local_id);
using OCLT = detail::ConvertToOpenCLType_t<T>;
using OCLIdT = detail::ConvertToOpenCLType_t<IdT>;
OCLT ocl_x = detail::convertDataToType<T, OCLT>(x);
OCLIdT ocl_id = detail::convertDataToType<IdT, OCLIdT>(local_id);
return __spirv_GroupBroadcast(group_scope<Group>::value, ocl_x, ocl_id);
using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
OCLT OCLX = detail::convertDataToType<T, OCLT>(x);
OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
}
template <typename Group, typename T, typename IdT>
detail::enable_if_t<is_sub_group<Group>::value && std::is_integral<IdT>::value,
T>
GroupBroadcast(T x, IdT local_id) {
using SGIdT = uint32_t;
SGIdT sg_local_id = static_cast<SGIdT>(local_id);
using OCLT = detail::ConvertToOpenCLType_t<T>;
using OCLIdT = detail::ConvertToOpenCLType_t<SGIdT>;
OCLT ocl_x = detail::convertDataToType<T, OCLT>(x);
OCLIdT ocl_id = detail::convertDataToType<SGIdT, OCLIdT>(sg_local_id);
return __spirv_GroupBroadcast(group_scope<Group>::value, ocl_x, ocl_id);
EnableIfBitcastBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
using GroupIdT = typename GroupId<Group>::type;
GroupIdT GroupLocalId = static_cast<GroupIdT>(local_id);
using BroadcastT = ConvertToNativeBroadcastType_t<T>;
using OCLIdT = detail::ConvertToOpenCLType_t<GroupIdT>;
auto BroadcastX = detail::bit_cast<BroadcastT>(x);
OCLIdT OCLId = detail::convertDataToType<GroupIdT, OCLIdT>(GroupLocalId);
BroadcastT Result =
__spirv_GroupBroadcast(group_scope<Group>::value, BroadcastX, OCLId);
return detail::bit_cast<T>(Result);
}
template <typename Group, typename T, typename IdT>
EnableIfGenericBroadcast<T, IdT> GroupBroadcast(T x, IdT local_id) {
T Result;
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
uint64_t BroadcastX, BroadcastResult;
detail::memcpy(&BroadcastX, XBytes + Offset, Size);
BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
};
GenericCall<T>(BroadcastBytes);
return Result;
}

// Broadcast with vector local index
template <typename Group, typename T, int Dimensions>
T GroupBroadcast(T x, id<Dimensions> local_id) {
EnableIfNativeBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
if (Dimensions == 1) {
return GroupBroadcast<Group>(x, local_id[0]);
}
using IdT = vec<size_t, Dimensions>;
using OCLT = detail::ConvertToOpenCLType_t<T>;
using OCLIdT = detail::ConvertToOpenCLType_t<IdT>;
IdT vec_id;
IdT VecId;
for (int i = 0; i < Dimensions; ++i) {
VecId[i] = local_id[Dimensions - i - 1];
}
OCLT OCLX = detail::convertDataToType<T, OCLT>(x);
OCLIdT OCLId = detail::convertDataToType<IdT, OCLIdT>(VecId);
return __spirv_GroupBroadcast(group_scope<Group>::value, OCLX, OCLId);
}
template <typename Group, typename T, int Dimensions>
EnableIfBitcastBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
if (Dimensions == 1) {
return GroupBroadcast<Group>(x, local_id[0]);
}
using IdT = vec<size_t, Dimensions>;
using BroadcastT = ConvertToNativeBroadcastType_t<T>;
using OCLIdT = detail::ConvertToOpenCLType_t<IdT>;
IdT VecId;
for (int i = 0; i < Dimensions; ++i) {
vec_id[i] = local_id[Dimensions - i - 1];
VecId[i] = local_id[Dimensions - i - 1];
}
OCLT ocl_x = detail::convertDataToType<T, OCLT>(x);
OCLIdT ocl_id = detail::convertDataToType<IdT, OCLIdT>(vec_id);
return __spirv_GroupBroadcast(group_scope<Group>::value, ocl_x, ocl_id);
auto BroadcastX = detail::bit_cast<BroadcastT>(x);
OCLIdT OCLId = detail::convertDataToType<IdT, OCLIdT>(VecId);
BroadcastT Result =
__spirv_GroupBroadcast(group_scope<Group>::value, BroadcastX, OCLId);
return detail::bit_cast<T>(Result);
}
template <typename Group, typename T, int Dimensions>
EnableIfGenericBroadcast<T> GroupBroadcast(T x, id<Dimensions> local_id) {
if (Dimensions == 1) {
return GroupBroadcast<Group>(x, local_id[0]);
}
T Result;
char *XBytes = reinterpret_cast<char *>(&x);
char *ResultBytes = reinterpret_cast<char *>(&Result);
auto BroadcastBytes = [=](size_t Offset, size_t Size) {
uint64_t BroadcastX, BroadcastResult;
detail::memcpy(&BroadcastX, XBytes + Offset, Size);
BroadcastResult = GroupBroadcast<Group>(BroadcastX, local_id);
detail::memcpy(ResultBytes + Offset, &BroadcastResult, Size);
};
GenericCall<T>(BroadcastBytes);
return Result;
}

// Single happens-before means semantics should always apply to all spaces
Expand Down Expand Up @@ -400,28 +516,6 @@ using EnableIfGenericShuffle =
sizeof(T) == 4 || sizeof(T) == 8)),
T>;

template <typename T, typename ShuffleFunctor>
void GenericShuffle(const ShuffleFunctor &ShuffleBytes) {
if (sizeof(T) >= sizeof(uint64_t)) {
#pragma unroll
for (size_t Offset = 0; Offset < sizeof(T); Offset += sizeof(uint64_t)) {
ShuffleBytes(Offset, sizeof(uint64_t));
}
}
if (sizeof(T) % sizeof(uint64_t) >= sizeof(uint32_t)) {
size_t Offset = sizeof(T) / sizeof(uint64_t) * sizeof(uint64_t);
ShuffleBytes(Offset, sizeof(uint32_t));
}
if (sizeof(T) % sizeof(uint32_t) >= sizeof(uint16_t)) {
size_t Offset = sizeof(T) / sizeof(uint32_t) * sizeof(uint32_t);
ShuffleBytes(Offset, sizeof(uint16_t));
}
if (sizeof(T) % sizeof(uint16_t) >= sizeof(uint8_t)) {
size_t Offset = sizeof(T) / sizeof(uint16_t) * sizeof(uint16_t);
ShuffleBytes(Offset, sizeof(uint8_t));
}
}

template <typename T>
EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
T Result;
Expand All @@ -433,7 +527,7 @@ EnableIfGenericShuffle<T> SubgroupShuffle(T x, id<1> local_id) {
ShuffleResult = SubgroupShuffle(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericShuffle<T>(ShuffleBytes);
GenericCall<T>(ShuffleBytes);
return Result;
}

Expand All @@ -448,7 +542,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleXor(T x, id<1> local_id) {
ShuffleResult = SubgroupShuffleXor(ShuffleX, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericShuffle<T>(ShuffleBytes);
GenericCall<T>(ShuffleBytes);
return Result;
}

Expand All @@ -465,7 +559,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleDown(T x, T y, id<1> local_id) {
ShuffleResult = SubgroupShuffleDown(ShuffleX, ShuffleY, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericShuffle<T>(ShuffleBytes);
GenericCall<T>(ShuffleBytes);
return Result;
}

Expand All @@ -482,7 +576,7 @@ EnableIfGenericShuffle<T> SubgroupShuffleUp(T x, T y, id<1> local_id) {
ShuffleResult = SubgroupShuffleUp(ShuffleX, ShuffleY, local_id);
detail::memcpy(ResultBytes + Offset, &ShuffleResult, Size);
};
GenericShuffle<T>(ShuffleBytes);
GenericCall<T>(ShuffleBytes);
return Result;
}

Expand Down
14 changes: 10 additions & 4 deletions sycl/include/CL/sycl/intel/group_algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,12 @@ template <typename Ptr, typename T>
using EnableIfIsPointer =
cl::sycl::detail::enable_if_t<cl::sycl::detail::is_pointer<Ptr>::value, T>;

template <typename T>
using EnableIfIsTriviallyCopyable = cl::sycl::detail::enable_if_t<
std::is_trivially_copyable<T>::value &&
!cl::sycl::detail::is_vector_arithmetic<T>::value,
T>;

// EnableIf shorthands for algorithms that depend on type and an operator
template <typename T, typename BinaryOperation>
using EnableIfIsScalarArithmeticNativeOp = cl::sycl::detail::enable_if_t<
Expand Down Expand Up @@ -286,8 +292,8 @@ EnableIfIsPointer<Ptr, bool> none_of(Group g, Ptr first, Ptr last,
}

template <typename Group, typename T>
EnableIfIsScalarArithmetic<T> broadcast(Group, T x,
typename Group::id_type local_id) {
EnableIfIsTriviallyCopyable<T> broadcast(Group, T x,
typename Group::id_type local_id) {
static_assert(sycl::detail::is_generic_group<Group>::value,
"Group algorithms only support the sycl::group and "
"intel::sub_group class.");
Expand Down Expand Up @@ -323,7 +329,7 @@ EnableIfIsVectorArithmetic<T> broadcast(Group g, T x,
}

template <typename Group, typename T>
EnableIfIsScalarArithmetic<T>
EnableIfIsTriviallyCopyable<T>
broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
static_assert(sycl::detail::is_generic_group<Group>::value,
"Group algorithms only support the sycl::group and "
Expand Down Expand Up @@ -363,7 +369,7 @@ broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
}

template <typename Group, typename T>
EnableIfIsScalarArithmetic<T> broadcast(Group g, T x) {
EnableIfIsTriviallyCopyable<T> broadcast(Group g, T x) {
static_assert(sycl::detail::is_generic_group<Group>::value,
"Group algorithms only support the sycl::group and "
"intel::sub_group class.");
Expand Down
51 changes: 45 additions & 6 deletions sycl/test/group-algorithm/broadcast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,19 @@
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
#include <complex>
#include <numeric>
using namespace sycl;
using namespace sycl::intel;

template <typename InputContainer, typename OutputContainer>
class broadcast_kernel;

template <typename InputContainer, typename OutputContainer>
void test(queue q, InputContainer input, OutputContainer output) {
typedef typename InputContainer::value_type InputT;
typedef typename OutputContainer::value_type OutputT;
typedef class broadcast_kernel kernel_name;
typedef class broadcast_kernel<InputContainer, OutputContainer> kernel_name;
size_t N = input.size();
size_t G = 4;
range<2> R(G, G);
Expand Down Expand Up @@ -54,12 +56,49 @@ int main() {
}

constexpr int N = 16;
std::array<int, N> input;
std::array<int, N> output;
std::iota(input.begin(), input.end(), 1);
std::fill(output.begin(), output.end(), false);

test(q, input, output);
// Test built-in scalar type
{
std::array<int, N> input;
std::array<int, 3> output;
std::iota(input.begin(), input.end(), 1);
std::fill(output.begin(), output.end(), false);
test(q, input, output);
}

// Test pointer type
{
std::array<int *, N> input;
std::array<int *, 3> output;
for (int i = 0; i < N; ++i) {
input[i] = static_cast<int *>(0x0) + i;
}
std::fill(output.begin(), output.end(), static_cast<int *>(0x0));
test(q, input, output);
}

// Test user-defined type
// - Use complex as a proxy for this
// - Test float and double to test 64-bit and 128-bit types
{
std::array<std::complex<float>, N> input;
std::array<std::complex<float>, 3> output;
for (int i = 0; i < N; ++i) {
input[i] =
std::complex<float>(0, 1) + (float)i * std::complex<float>(2, 2);
}
std::fill(output.begin(), output.end(), std::complex<float>(0, 0));
test(q, input, output);
}
{
std::array<std::complex<double>, N> input;
std::array<std::complex<double>, 3> output;
for (int i = 0; i < N; ++i) {
input[i] =
std::complex<double>(0, 1) + (double)i * std::complex<double>(2, 2);
}
std::fill(output.begin(), output.end(), std::complex<float>(0, 0));
test(q, input, output);
}
std::cout << "Test passed." << std::endl;
}