Skip to content

Commit adaaaed

Browse files
[SYCL][Graph] Support for sycl_ext_oneapi_enqueue_barrier extension (#301)
* [SYCL][Graph] Support for sycl_ext_oneapi_enqueue_barrier extension Adds support to handle barrier enqueuing with Record&Replay API. Barriers are implemented as empty nodes enforcing the required dependencies. Adds tests that check 1) correctness of graph structure when barriers have been enqueued, 2) processing behavior, 3) exception throwing if barriers are used within explicit API. Notes: 1) Multi-queues barrier is not supported since it does not make sense with asynchronous graph execution. 2) Barriers can only be used with Record&Replay API, since barriers rely on events to enforce dependencies. * [SYCL][Graph] Adds unitest with multiple barriers and test-e2e Adds unitest with multiple barriers and test-e2e. Corrects some typos. * Update sycl/source/detail/graph_impl.cpp Co-authored-by: Ben Tracy <ben.tracy@codeplay.com> --------- Co-authored-by: Ben Tracy <ben.tracy@codeplay.com>
1 parent 5318388 commit adaaaed

File tree

6 files changed

+377
-106
lines changed

6 files changed

+377
-106
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2560,9 +2560,6 @@ class __SYCL_EXPORT handler {
25602560
/// until all commands previously submitted to this queue have entered the
25612561
/// complete state.
25622562
void ext_oneapi_barrier() {
2563-
throwIfGraphAssociated<
2564-
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
2565-
sycl_ext_oneapi_enqueue_barrier>();
25662563
throwIfActionIsCreated();
25672564
setType(detail::CG::Barrier);
25682565
}

sycl/source/detail/graph_impl.cpp

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,11 +213,17 @@ void graph_impl::removeRoot(const std::shared_ptr<node_impl> &Root) {
213213

214214
std::shared_ptr<node_impl>
215215
graph_impl::add(const std::vector<std::shared_ptr<node_impl>> &Dep) {
216+
// Copy deps so we can modify them
217+
auto Deps = Dep;
218+
216219
const std::shared_ptr<node_impl> &NodeImpl = std::make_shared<node_impl>();
217220

221+
// Add any deps from the vector of extra dependencies
222+
Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end());
223+
218224
// TODO: Encapsulate in separate function to avoid duplication
219-
if (!Dep.empty()) {
220-
for (auto N : Dep) {
225+
if (!Deps.empty()) {
226+
for (auto N : Deps) {
221227
N->registerSuccessor(NodeImpl, N); // register successor
222228
this->removeRoot(NodeImpl); // remove receiver from root node
223229
// list
@@ -239,6 +245,13 @@ graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
239245
CGF(Handler);
240246
Handler.finalize();
241247

248+
if (Handler.MCGType == sycl::detail::CG::Barrier) {
249+
throw sycl::exception(
250+
make_error_code(errc::invalid),
251+
"The sycl_ext_oneapi_enqueue_barrier feature is not available with "
252+
"SYCL Graph Explicit API. Please use empty nodes instead.");
253+
}
254+
242255
// If the handler recorded a subgraph return that here as the relevant nodes
243256
// have already been added. The node returned here is an empty node with
244257
// dependencies on all the exit nodes of the subgraph.
@@ -319,6 +332,9 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType,
319332
// list
320333
Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
321334

335+
// Add any deps from the extra dependencies vector
336+
Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end());
337+
322338
const std::shared_ptr<node_impl> &NodeImpl =
323339
std::make_shared<node_impl>(CGType, std::move(CommandGroup));
324340
if (!Deps.empty()) {
@@ -330,6 +346,12 @@ graph_impl::add(sycl::detail::CG::CGTYPE CGType,
330346
} else {
331347
this->addRoot(NodeImpl);
332348
}
349+
350+
// Set barrier nodes as prerequisites (new start points) for subsequent nodes
351+
if (CGType == sycl::detail::CG::Barrier) {
352+
MExtraDependencies.push_back(NodeImpl);
353+
}
354+
333355
return NodeImpl;
334356
}
335357

@@ -441,6 +463,20 @@ void graph_impl::makeEdge(std::shared_ptr<node_impl> Src,
441463
removeRoot(Dest); // remove receiver from root node list
442464
}
443465

466+
std::vector<sycl::detail::EventImplPtr> graph_impl::getExitNodesEvents() {
467+
std::vector<sycl::detail::EventImplPtr> Events;
468+
auto EnqueueExitNodesEvents = [&](std::shared_ptr<node_impl> &Node,
469+
std::deque<std::shared_ptr<node_impl>> &) {
470+
if (Node->MSuccessors.size() == 0) {
471+
Events.push_back(getEventForNode(Node));
472+
}
473+
return false;
474+
};
475+
476+
searchDepthFirst(EnqueueExitNodesEvents);
477+
return Events;
478+
}
479+
444480
// Check if nodes are empty and if so loop back through predecessors until we
445481
// find the real dependency.
446482
void exec_graph_impl::findRealDeps(

sycl/source/detail/graph_impl.hpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,13 @@ class node_impl {
160160
}
161161

162162
/// Query if this is an empty node.
163+
/// Barrier nodes are also considered empty nodes since they do not embed any
164+
/// workload but only dependencies
163165
/// @return True if this is an empty node, false otherwise.
164-
bool isEmpty() const { return MCGType == sycl::detail::CG::None; }
166+
bool isEmpty() const {
167+
return ((MCGType == sycl::detail::CG::None) ||
168+
(MCGType == sycl::detail::CG::Barrier));
169+
}
165170

166171
/// Get a deep copy of this node's command group
167172
/// @return A unique ptr to the new command group object.
@@ -319,8 +324,8 @@ class node_impl {
319324
printDotCG(Stream);
320325
for (const auto &Dep : MPredecessors) {
321326
auto NodeDep = Dep.lock();
322-
Stream << " \"" << MCommandGroup.get() << "\" -> \""
323-
<< NodeDep->MCommandGroup.get() << "\"" << std::endl;
327+
Stream << " \"" << NodeDep->MCommandGroup.get() << "\" -> \""
328+
<< MCommandGroup.get() << "\"" << std::endl;
324329
}
325330

326331
for (std::shared_ptr<node_impl> Succ : MSuccessors) {
@@ -677,6 +682,11 @@ class graph_impl {
677682
return NumberOfNodes;
678683
}
679684

685+
/// Traverse the graph recursively to get the events associated with the
686+
/// output nodes of this graph.
687+
/// @return vector of events associated to exit nodes.
688+
std::vector<sycl::detail::EventImplPtr> getExitNodesEvents();
689+
680690
private:
681691
/// Iterate over the graph depth-first and run \p NodeFunc on each node.
682692
/// @param NodeFunc A function which receives as input a node in the graph to
@@ -738,6 +748,12 @@ class graph_impl {
738748
/// @return An empty node is used to schedule dependencies on this sub-graph.
739749
std::shared_ptr<node_impl>
740750
addNodesToExits(const std::list<std::shared_ptr<node_impl>> &NodeList);
751+
752+
/// List of nodes that must be added as extra dependencies to new nodes when
753+
/// added to this graph.
754+
/// This list is mainly used by barrier nodes which must be considered
755+
/// as predecessors for all nodes subsequently added to the graph.
756+
std::vector<std::shared_ptr<node_impl>> MExtraDependencies;
741757
};
742758

743759
/// Class representing the implementation of command_graph<executable>.

sycl/source/handler.cpp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -368,11 +368,28 @@ event handler::finalize() {
368368
std::move(MArgs), std::move(CGData), MCGType, MCodeLoc));
369369
break;
370370
case detail::CG::Barrier:
371-
case detail::CG::BarrierWaitlist:
372-
CommandGroup.reset(new detail::CGBarrier(std::move(MEventsWaitWithBarrier),
373-
std::move(CGData), MCGType,
374-
MCodeLoc));
371+
case detail::CG::BarrierWaitlist: {
372+
if (auto GraphImpl = getCommandGraph(); GraphImpl != nullptr) {
373+
// if no event to wait for was specified, we add all the previous
374+
// nodes/events of the graph
375+
if (MEventsWaitWithBarrier.size() == 0) {
376+
MEventsWaitWithBarrier = GraphImpl->getExitNodesEvents();
377+
}
378+
CGData.MEvents.insert(std::end(CGData.MEvents),
379+
std::begin(MEventsWaitWithBarrier),
380+
std::end(MEventsWaitWithBarrier));
381+
// Barrier node is implemented as an empty node in Graph
382+
// but keep the barrier type to help managing dependencies
383+
MCGType = detail::CG::Barrier;
384+
CommandGroup.reset(
385+
new detail::CG(detail::CG::Barrier, std::move(CGData), MCodeLoc));
386+
} else {
387+
CommandGroup.reset(
388+
new detail::CGBarrier(std::move(MEventsWaitWithBarrier),
389+
std::move(CGData), MCGType, MCodeLoc));
390+
}
375391
break;
392+
}
376393
case detail::CG::CopyToDeviceGlobal: {
377394
CommandGroup.reset(new detail::CGCopyToDeviceGlobal(
378395
MSrcPtr, MDstPtr, MImpl->MIsDeviceImageScoped, MLength, MImpl->MOffset,
@@ -806,9 +823,6 @@ void handler::verifyUsedKernelBundle(const std::string &KernelName) {
806823
}
807824

808825
void handler::ext_oneapi_barrier(const std::vector<event> &WaitList) {
809-
throwIfGraphAssociated<
810-
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
811-
sycl_ext_oneapi_enqueue_barrier>();
812826
throwIfActionIsCreated();
813827
MCGType = detail::CG::BarrierWaitlist;
814828
MEventsWaitWithBarrier.resize(WaitList.size());
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
// REQUIRES: level_zero, gpu
2+
// RUN: %{build} -o %t.out
3+
// RUN: %{run} %t.out
4+
// Extra run to check for leaks in Level Zero using ZE_DEBUG
5+
// RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
6+
//
7+
// CHECK-NOT: LEAK
8+
9+
#include "../graph_common.hpp"
10+
11+
//// Test Explicit API graph construction with USM.
12+
///
13+
/// @param Q Command-queue to make kernel submissions to.
14+
/// @param Size Number of elements in the buffers.
15+
/// @param DataA Pointer to first USM allocation to use in kernels.
16+
/// @param DataB Pointer to second USM allocation to use in kernels.
17+
/// @param DataC Pointer to third USM allocation to use in kernels.
18+
///
19+
/// @return Event corresponding to the exit node of the submission sequence.
20+
template <typename T>
21+
event run_kernels_usm_with_barrier(queue Q, const size_t Size, T *DataA,
22+
T *DataB, T *DataC) {
23+
// Read & write Buffer A
24+
auto EventA = Q.submit([&](handler &CGH) {
25+
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
26+
auto LinID = Id.get_linear_id();
27+
DataA[LinID]++;
28+
});
29+
});
30+
31+
Q.ext_oneapi_submit_barrier();
32+
33+
// Reads Buffer A
34+
// Read & Write Buffer B
35+
auto EventB = Q.submit([&](handler &CGH) {
36+
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
37+
auto LinID = Id.get_linear_id();
38+
DataB[LinID] += DataA[LinID];
39+
});
40+
});
41+
42+
// Reads Buffer A
43+
// Read & writes Buffer C
44+
auto EventC = Q.submit([&](handler &CGH) {
45+
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
46+
auto LinID = Id.get_linear_id();
47+
DataC[LinID] -= DataA[LinID];
48+
});
49+
});
50+
51+
Q.ext_oneapi_submit_barrier();
52+
53+
// Read & write Buffers B and C
54+
auto ExitEvent = Q.submit([&](handler &CGH) {
55+
CGH.parallel_for(range<1>(Size), [=](item<1> Id) {
56+
auto LinID = Id.get_linear_id();
57+
DataB[LinID]--;
58+
DataC[LinID]--;
59+
});
60+
});
61+
return ExitEvent;
62+
}
63+
64+
int main() {
65+
queue Queue;
66+
67+
using T = int;
68+
69+
std::vector<T> DataA(Size), DataB(Size), DataC(Size);
70+
71+
std::iota(DataA.begin(), DataA.end(), 1);
72+
std::iota(DataB.begin(), DataB.end(), 10);
73+
std::iota(DataC.begin(), DataC.end(), 1000);
74+
75+
std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
76+
calculate_reference_data(Iterations, Size, ReferenceA, ReferenceB,
77+
ReferenceC);
78+
79+
exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
80+
81+
T *PtrA = malloc_device<T>(Size, Queue);
82+
T *PtrB = malloc_device<T>(Size, Queue);
83+
T *PtrC = malloc_device<T>(Size, Queue);
84+
85+
Queue.copy(DataA.data(), PtrA, Size);
86+
Queue.copy(DataB.data(), PtrB, Size);
87+
Queue.copy(DataC.data(), PtrC, Size);
88+
Queue.wait_and_throw();
89+
90+
// Add commands to graph
91+
Graph.begin_recording(Queue);
92+
auto ev = run_kernels_usm_with_barrier(Queue, Size, PtrA, PtrB, PtrC);
93+
Graph.end_recording(Queue);
94+
95+
auto GraphExec = Graph.finalize();
96+
97+
event Event;
98+
for (unsigned n = 0; n < Iterations; n++) {
99+
Event = Queue.submit([&](handler &CGH) {
100+
CGH.depends_on(Event);
101+
CGH.ext_oneapi_graph(GraphExec);
102+
});
103+
}
104+
Queue.wait_and_throw();
105+
106+
Queue.copy(PtrA, DataA.data(), Size);
107+
Queue.copy(PtrB, DataB.data(), Size);
108+
Queue.copy(PtrC, DataC.data(), Size);
109+
Queue.wait_and_throw();
110+
111+
free(PtrA, Queue);
112+
free(PtrB, Queue);
113+
free(PtrC, Queue);
114+
115+
assert(ReferenceA == DataA);
116+
assert(ReferenceB == DataB);
117+
assert(ReferenceC == DataC);
118+
119+
return 0;
120+
}

0 commit comments

Comments
 (0)