Skip to content

Commit 3df81ac

Browse files
authored
[SYCL] Add implementation of sycl::intel::barrier (#2198)
Exposes barrier as a free function, aligned with group_barrier from SYCL 2020 provisional. Naming is aligned with the existing DPC++ group algorithms, hence barrier in place of group_barrier. Signed-off-by: John Pennycook <john.pennycook@intel.com>
1 parent d5a7f20 commit 3df81ac

File tree

4 files changed

+129
-1
lines changed

4 files changed

+129
-1
lines changed

sycl/doc/extensions/GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ John Pennycook, Intel (john 'dot' pennycook 'at' intel 'dot' com)
5151

5252
== Dependencies
5353

54-
This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6.
54+
This extension is written against the SYCL 1.2.1 specification, Revision v1.2.1-6 and the following extensions:
55+
56+
- +SYCL_INTEL_extended_atomics+
5557

5658
== Overview
5759

@@ -67,6 +69,10 @@ The extension introduces the following functions:
6769
- +reduce+
6870
- +exclusive_scan+
6971
- +inclusive_scan+
72+
- +barrier+
73+
74+
The definitions and behavior of the following functions are based on equivalents in the SYCL 2020 provisional specification:
75+
- +barrier+
7076

7177
=== Alignment with OpenCL vs C++
7278

@@ -252,6 +258,21 @@ The return types of the collective functions in {cpp}17 are not deduced from the
252258
|Perform an inclusive scan over the values in the range [_first_, _last_) using the operator _binary_op_, which must be one of the group algorithms library function objects. The value written to +result + i+ is the inclusive scan of the first +i+ values in the range and an initial value specified by _init_. Returns a pointer to the end of the output range. _first_, _last_, _result_, _binary_op_ and _init_ must be the same for all work-items in the group. _binary_op(init, *first)_ must return a value of type _T_.
253259
|===
254260

261+
==== Synchronization
262+
263+
The behavior of memory fences in this section is aligned with the single happens-before relationship defined by the +SYCL_INTEL_extended_atomics+ extension.
264+
265+
|===
266+
|Function|Description
267+
268+
|+template <typename Group> void barrier(Group g);+
269+
|Synchronize all work-items in the group, and ensure that all memory accesses to any address space prior to the barrier are visible to all work-items in the group after the barrier. The scope of the group memory fences implied by this barrier is the narrowest scope including all work-items in the group.
270+
271+
|+template <typename Group> void barrier(Group g, memory_scope scope);+
272+
|Synchronize all work-items in the group, and ensure that all memory accesses to any address space prior to the barrier are visible to all work-items specified by _scope_ after the barrier. The scope of the group memory fences implied by this barrier is controlled by _scope_ and must be broader than the narrowest scope including all work-items in the group. If the specified _scope_ is narrower than the narrowest scope including all work-items in the group, the _scope_ argument is ignored.
273+
274+
|===
275+
255276
== Issues
256277

257278
None.
@@ -270,6 +291,7 @@ None.
270291
|========================================
271292
|Rev|Date|Author|Changes
272293
|1|2020-01-30|John Pennycook|*Initial public working draft*
294+
|2|2020-07-28|John Pennycook|*Add group barrier*
273295
|========================================
274296
275297
//************************************************************************

sycl/doc/extensions/SubGroupAlgorithms/SYCL_INTEL_sub_group_algorithms.asciidoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ This extension adds sub-group support to all of the functions from +SYCL_INTEL_g
7070
- +reduce+
7171
- +exclusive_scan+
7272
- +inclusive_scan+
73+
- +barrier+
7374

7475
It additionally introduces a number of functions that are currently specific to sub-groups:
7576

@@ -165,6 +166,7 @@ None.
165166
|========================================
166167
|Rev|Date|Author|Changes
167168
|1|2020-03-16|John Pennycook|*Initial public working draft*
169+
|2|2020-07-28|John Pennycook|*Add group barrier*
168170
|========================================
169171
170172
//************************************************************************

sycl/include/CL/sycl/intel/group_algorithm.hpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <CL/sycl/detail/spirv.hpp>
1414
#include <CL/sycl/detail/type_traits.hpp>
1515
#include <CL/sycl/group.hpp>
16+
#include <CL/sycl/intel/atomic.hpp>
1617
#include <CL/sycl/intel/functional.hpp>
1718
#include <CL/sycl/intel/sub_group.hpp>
1819

@@ -77,6 +78,15 @@ template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) {
7778
return result;
7879
}
7980

81+
// TODO: Replace with Group::fence_scope from SYCL 2020 provisional
82+
template <typename Group> struct FenceScope {
83+
static constexpr intel::memory_scope value = intel::memory_scope::work_group;
84+
};
85+
86+
template <> struct FenceScope<intel::sub_group> {
87+
static constexpr intel::memory_scope value = intel::memory_scope::sub_group;
88+
};
89+
8090
template <typename T, class BinaryOperation> struct identity {};
8191

8292
template <typename T, typename V> struct identity<T, intel::plus<V>> {
@@ -896,6 +906,42 @@ template <typename Group> bool leader(Group g) {
896906
#endif
897907
}
898908

909+
template <typename Group> void barrier(Group, memory_scope scope) {
910+
static_assert(sycl::detail::is_generic_group<Group>::value,
911+
"Group algorithms only support the sycl::group and "
912+
"intel::sub_group class.");
913+
#ifdef __SYCL_DEVICE_ONLY__
914+
// MemoryScope must be broader than Group scope for correctness
915+
auto GroupScope = detail::FenceScope<Group>::value;
916+
auto BroadestScope = (scope > GroupScope) ? scope : GroupScope;
917+
auto MemoryScope = sycl::detail::spirv::getScope(BroadestScope);
918+
auto ExecutionScope = sycl::detail::spirv::group_scope<Group>::value;
919+
__spirv_ControlBarrier(ExecutionScope, MemoryScope,
920+
__spv::MemorySemanticsMask::AcquireRelease |
921+
__spv::MemorySemanticsMask::SubgroupMemory |
922+
__spv::MemorySemanticsMask::WorkgroupMemory |
923+
__spv::MemorySemanticsMask::CrossWorkgroupMemory);
924+
#else
925+
(void)scope;
926+
throw runtime_error("Group algorithms are not supported on host device.",
927+
PI_INVALID_DEVICE);
928+
#endif
929+
}
930+
931+
template <typename Group> void barrier(Group g) {
932+
static_assert(sycl::detail::is_generic_group<Group>::value,
933+
"Group algorithms only support the sycl::group and "
934+
"intel::sub_group class.");
935+
#ifdef __SYCL_DEVICE_ONLY__
936+
auto MemoryScope = detail::FenceScope<Group>::value;
937+
barrier(g, MemoryScope);
938+
#else
939+
(void)g;
940+
throw runtime_error("Group algorithms are not supported on host device.",
941+
PI_INVALID_DEVICE);
942+
#endif
943+
}
944+
899945
} // namespace intel
900946
} // namespace sycl
901947
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/test/group-algorithm/barrier.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// UNSUPPORTED: cuda
2+
//
3+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
6+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
7+
8+
#include <CL/sycl.hpp>
9+
#include <cassert>
10+
using namespace sycl;
11+
using namespace sycl::intel;
12+
13+
class barrier_kernel;
14+
15+
void test(queue q) {
16+
17+
constexpr size_t N = 32;
18+
constexpr size_t L = 16;
19+
std::array<int, N> out;
20+
std::fill(out.begin(), out.end(), 0);
21+
{
22+
buffer<int> out_buf(out.data(), range<1>{N});
23+
q.submit([&](handler &cgh) {
24+
auto tmp =
25+
accessor<int, 1, access::mode::read_write, access::target::local>(
26+
L, cgh);
27+
auto out = out_buf.get_access<access::mode::read_write>(cgh);
28+
cgh.parallel_for<class barrier_kernel>(
29+
nd_range<1>(N, L), [=](nd_item<1> it) {
30+
group<1> g = it.get_group();
31+
tmp[it.get_local_linear_id()] = it.get_global_linear_id() + 1;
32+
barrier(g);
33+
int result = 0;
34+
for (int i = 0; i < L; ++i) {
35+
result += tmp[i];
36+
}
37+
out[it.get_global_linear_id()] = result;
38+
});
39+
});
40+
}
41+
42+
// Each work-item should see writes from all other work-items in its group
43+
for (int g = 0; g < N / L; ++g) {
44+
int sum = 0;
45+
for (int wi = 0; wi < L; ++wi) {
46+
sum += g * L + wi + 1;
47+
}
48+
for (int wi = 0; wi < L; ++wi) {
49+
assert(out[g * L + wi] == sum);
50+
}
51+
}
52+
}
53+
54+
int main() {
55+
queue q;
56+
test(q);
57+
std::cout << "Test passed." << std::endl;
58+
}

0 commit comments

Comments
 (0)