Skip to content

Commit 93fef86

Browse files
authored
[SYCL][Graph] Permit empty & barrier nodes in WGU (#14236)
In order to enable the minimum viable GROMACS use case for the Whole Graph Update feature, allow graphs to contain empty nodes and barrier nodes during update. See discussion thread #13253 (comment) on SYCL-Graph spec PR for publicizing the availability of the Whole Graph Update feature.
1 parent 99635a0 commit 93fef86

File tree

5 files changed

+317
-11
lines changed

5 files changed

+317
-11
lines changed

sycl/source/detail/graph_impl.cpp

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -372,7 +372,6 @@ graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
372372
(void)Args;
373373
sycl::handler Handler{Impl};
374374
CGF(Handler);
375-
Handler.finalize();
376375

377376
if (Handler.MCGType == sycl::detail::CG::Barrier) {
378377
throw sycl::exception(
@@ -381,6 +380,8 @@ graph_impl::add(const std::shared_ptr<graph_impl> &Impl,
381380
"SYCL Graph Explicit API. Please use empty nodes instead.");
382381
}
383382

383+
Handler.finalize();
384+
384385
node_type NodeType =
385386
Handler.MImpl->MUserFacingNodeType !=
386387
ext::oneapi::experimental::node_type::empty
@@ -1236,18 +1237,22 @@ void exec_graph_impl::update(
12361237
sycl::make_error_code(errc::invalid),
12371238
"Node passed to update() is not part of the graph.");
12381239
}
1239-
if (Node->MCGType != sycl::detail::CG::Kernel) {
1240-
throw sycl::exception(errc::invalid, "Cannot update non-kernel nodes");
1241-
}
12421240

1243-
if (Node->MCommandGroup->getRequirements().size() == 0) {
1244-
continue;
1241+
if (!(Node->isEmpty() || Node->MCGType == sycl::detail::CG::Kernel ||
1242+
Node->MCGType == sycl::detail::CG::Barrier)) {
1243+
throw sycl::exception(errc::invalid,
1244+
"Unsupported node type for update. Only kernel, "
1245+
"barrier and empty nodes are supported.");
12451246
}
1246-
NeedScheduledUpdate = true;
12471247

1248-
UpdateRequirements.insert(UpdateRequirements.end(),
1249-
Node->MCommandGroup->getRequirements().begin(),
1250-
Node->MCommandGroup->getRequirements().end());
1248+
if (const auto &CG = Node->MCommandGroup;
1249+
CG && CG->getRequirements().size() != 0) {
1250+
NeedScheduledUpdate = true;
1251+
1252+
UpdateRequirements.insert(UpdateRequirements.end(),
1253+
Node->MCommandGroup->getRequirements().begin(),
1254+
Node->MCommandGroup->getRequirements().end());
1255+
}
12511256
}
12521257

12531258
// Clean up any execution events which have finished so we don't pass them to
@@ -1290,6 +1295,11 @@ void exec_graph_impl::update(
12901295
}
12911296

12921297
void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
1298+
// Kernel node update is the only command type supported in UR for update.
1299+
// Updating any other types of nodes, e.g. empty & barrier nodes is a no-op.
1300+
if (Node->MCGType != sycl::detail::CG::Kernel) {
1301+
return;
1302+
}
12931303
auto ContextImpl = sycl::detail::getSyclObjImpl(MContext);
12941304
const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin();
12951305
auto DeviceImpl = sycl::detail::getSyclObjImpl(MGraphImpl->getDevice());
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
// Tests that whole graph update works when a graph contains a barrier node.
9+
10+
#include "../graph_common.hpp"
11+
12+
// Queue submissions that can be recorded to a graph, with a barrier node
13+
// separating initialization and computation kernel nodes
14+
template <class T>
15+
void RecordGraph(queue &Queue, size_t Size, T *Input1, T *Input2, T *Output) {
16+
Queue.submit([&](handler &CGH) {
17+
CGH.single_task([=]() {
18+
for (int i = 0; i < Size; i++) {
19+
Input1[i] += i;
20+
}
21+
});
22+
});
23+
24+
Queue.submit([&](handler &CGH) {
25+
CGH.single_task([=]() {
26+
for (int i = 0; i < Size; i++) {
27+
Input2[i] += i;
28+
}
29+
});
30+
});
31+
32+
Queue.ext_oneapi_submit_barrier();
33+
34+
Queue.submit([&](handler &CGH) {
35+
CGH.single_task([=]() {
36+
for (int i = 0; i < Size; i++) {
37+
Output[i] = Input1[i] * Input2[i];
38+
}
39+
});
40+
});
41+
}
42+
43+
int main() {
44+
queue Queue{};
45+
46+
using T = int;
47+
48+
// USM allocations for GraphA
49+
T *InputA1 = malloc_device<T>(Size, Queue);
50+
T *InputA2 = malloc_device<T>(Size, Queue);
51+
T *OutputA = malloc_device<T>(Size, Queue);
52+
53+
// Initialize USM allocations
54+
T Pattern1 = 0xA;
55+
T Pattern2 = 0x42;
56+
T PatternZero = 0;
57+
58+
Queue.fill(InputA1, Pattern1, Size);
59+
Queue.fill(InputA2, Pattern2, Size);
60+
Queue.fill(OutputA, PatternZero, Size);
61+
Queue.wait();
62+
63+
// Define GraphA
64+
exp_ext::command_graph GraphA{Queue};
65+
GraphA.begin_recording(Queue);
66+
RecordGraph(Queue, Size, InputA1, InputA2, OutputA);
67+
GraphA.end_recording();
68+
69+
// Finalize, run, and validate GraphA
70+
auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{});
71+
Queue.ext_oneapi_graph(GraphExecA).wait();
72+
73+
std::vector<T> HostOutput(Size);
74+
Queue.copy(OutputA, HostOutput.data(), Size).wait();
75+
76+
for (int i = 0; i < Size; i++) {
77+
T Ref = (Pattern1 + i) * (Pattern2 + i);
78+
assert(check_value(i, Ref, HostOutput[i], "OutputA"));
79+
}
80+
81+
// Create GraphB which will be used to update GraphA
82+
exp_ext::command_graph GraphB{Queue};
83+
84+
// USM allocations for GraphB
85+
T *InputB1 = malloc_device<T>(Size, Queue);
86+
T *InputB2 = malloc_device<T>(Size, Queue);
87+
T *OutputB = malloc_device<T>(Size, Queue);
88+
89+
// Initialize GraphB allocations
90+
Pattern1 = -42;
91+
Pattern2 = 0xF;
92+
93+
Queue.fill(InputB1, Pattern1, Size);
94+
Queue.fill(InputB2, Pattern2, Size);
95+
Queue.fill(OutputB, PatternZero, Size);
96+
Queue.wait();
97+
98+
// Create GraphB
99+
GraphB.begin_recording(Queue);
100+
RecordGraph(Queue, Size, InputB1, InputB2, OutputB);
101+
GraphB.end_recording();
102+
103+
// Update executable GraphA with GraphB, run, and validate
104+
GraphExecA.update(GraphB);
105+
Queue.ext_oneapi_graph(GraphExecA).wait();
106+
107+
Queue.copy(OutputB, HostOutput.data(), Size).wait();
108+
for (int i = 0; i < Size; i++) {
109+
T Ref = (Pattern1 + i) * (Pattern2 + i);
110+
assert(check_value(i, Ref, HostOutput[i], "OutputB"));
111+
}
112+
113+
free(InputA1, Queue);
114+
free(InputA2, Queue);
115+
free(OutputA, Queue);
116+
117+
free(InputB1, Queue);
118+
free(InputB2, Queue);
119+
free(OutputB, Queue);
120+
return 0;
121+
}
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
4+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
5+
// Extra run to check for immediate-command-list in Level Zero
6+
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
7+
8+
// Tests that whole graph update works when a graph contain an empty node.
9+
10+
#include "../graph_common.hpp"
11+
12+
// Creates a graph with an empty node separating initialization and computation
13+
// kernel nodes
14+
template <class T>
15+
void CreateGraph(
16+
exp_ext::command_graph<exp_ext::graph_state::modifiable> &Graph,
17+
size_t Size, T *Input1, T *Input2, T *Output) {
18+
Graph.add([&](handler &CGH) {
19+
CGH.single_task([=]() {
20+
for (int i = 0; i < Size; i++) {
21+
Input1[i] += i;
22+
}
23+
});
24+
});
25+
26+
Graph.add([&](handler &CGH) {
27+
CGH.single_task([=]() {
28+
for (int i = 0; i < Size; i++) {
29+
Input2[i] += i;
30+
}
31+
});
32+
});
33+
34+
auto EmptyNodeA =
35+
Graph.add({exp_ext::property::node::depends_on_all_leaves()});
36+
37+
Graph.add(
38+
[&](handler &CGH) {
39+
CGH.single_task([=]() {
40+
for (int i = 0; i < Size; i++) {
41+
Output[i] = Input1[i] * Input2[i];
42+
}
43+
});
44+
},
45+
{exp_ext::property::node::depends_on(EmptyNodeA)});
46+
}
47+
48+
int main() {
49+
queue Queue{};
50+
51+
using T = int;
52+
53+
// USM allocations for GraphA
54+
T *InputA1 = malloc_device<T>(Size, Queue);
55+
T *InputA2 = malloc_device<T>(Size, Queue);
56+
T *OutputA = malloc_device<T>(Size, Queue);
57+
58+
// Initialize USM allocations
59+
T Pattern1 = 0xA;
60+
T Pattern2 = 0x42;
61+
T PatternZero = 0;
62+
63+
Queue.fill(InputA1, Pattern1, Size);
64+
Queue.fill(InputA2, Pattern2, Size);
65+
Queue.fill(OutputA, PatternZero, Size);
66+
Queue.wait();
67+
68+
// Construct GraphA
69+
exp_ext::command_graph GraphA{Queue};
70+
CreateGraph(GraphA, Size, InputA1, InputA2, OutputA);
71+
72+
// Finalize, run, and validate GraphA
73+
auto GraphExecA = GraphA.finalize(exp_ext::property::graph::updatable{});
74+
Queue.ext_oneapi_graph(GraphExecA).wait();
75+
76+
std::vector<T> HostOutput(Size);
77+
Queue.copy(OutputA, HostOutput.data(), Size).wait();
78+
79+
for (int i = 0; i < Size; i++) {
80+
T Ref = (Pattern1 + i) * (Pattern2 + i);
81+
assert(check_value(i, Ref, HostOutput[i], "OutputA"));
82+
}
83+
84+
// Create GraphB which will be used to update GraphA
85+
exp_ext::command_graph GraphB{Queue};
86+
87+
// USM allocations for GraphB
88+
T *InputB1 = malloc_device<T>(Size, Queue);
89+
T *InputB2 = malloc_device<T>(Size, Queue);
90+
T *OutputB = malloc_device<T>(Size, Queue);
91+
92+
// Initialize GraphB
93+
Pattern1 = -42;
94+
Pattern2 = 0xF;
95+
96+
Queue.fill(InputB1, Pattern1, Size);
97+
Queue.fill(InputB2, Pattern2, Size);
98+
Queue.fill(OutputB, PatternZero, Size);
99+
Queue.wait();
100+
101+
// Construct GraphB
102+
CreateGraph(GraphB, Size, InputB1, InputB2, OutputB);
103+
104+
// Update executable GraphA with GraphB, run, and validate
105+
GraphExecA.update(GraphB);
106+
Queue.ext_oneapi_graph(GraphExecA).wait();
107+
108+
Queue.copy(OutputB, HostOutput.data(), Size).wait();
109+
110+
for (int i = 0; i < Size; i++) {
111+
T Ref = (Pattern1 + i) * (Pattern2 + i);
112+
assert(check_value(i, Ref, HostOutput[i], "OutputB"));
113+
}
114+
115+
free(InputA1, Queue);
116+
free(InputA2, Queue);
117+
free(OutputA, Queue);
118+
119+
free(InputB1, Queue);
120+
free(InputB2, Queue);
121+
free(OutputB, Queue);
122+
return 0;
123+
}

sycl/unittests/Extensions/CommandGraph/Exceptions.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,15 +216,25 @@ void addImagesCopies(experimental::detail::modifiable_command_graph &G,
216216
} // anonymous namespace
217217

218218
TEST_F(CommandGraphTest, ExplicitBarrierException) {
219-
219+
bool Success = true;
220220
std::error_code ExceptionCode = make_error_code(sycl::errc::success);
221221
try {
222222
auto Barrier =
223223
Graph.add([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); });
224224
} catch (exception &Exception) {
225225
ExceptionCode = Exception.code();
226+
std::string ErrorStr =
227+
"The sycl_ext_oneapi_enqueue_barrier feature is "
228+
"not available with SYCL Graph Explicit API. Please use empty nodes "
229+
"instead.";
230+
std::cout << Exception.what() << std::endl;
231+
std::cout << ErrorStr << std::endl;
232+
ASSERT_FALSE(std::string(Exception.what()).find(ErrorStr) ==
233+
std::string::npos);
234+
Success = false;
226235
}
227236
ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
237+
ASSERT_EQ(Success, false);
228238
}
229239

230240
TEST_F(CommandGraphTest, FusionExtensionExceptionCheck) {

sycl/unittests/Extensions/CommandGraph/Update.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,18 @@ TEST_F(CommandGraphTest, UpdateNodeTypeExceptions) {
109109
cgh.host_task([]() {});
110110
}));
111111

112+
ASSERT_ANY_THROW(auto NodeBarreriTask = Graph.add([&](sycl::handler &cgh) {
113+
cgh.set_arg(0, DynamicParam);
114+
cgh.ext_oneapi_barrier();
115+
}));
116+
117+
Graph.begin_recording(Queue);
118+
ASSERT_ANY_THROW(auto NodeBarrierTask = Graph.add([&](sycl::handler &cgh) {
119+
cgh.set_arg(0, DynamicParam);
120+
cgh.ext_oneapi_barrier();
121+
}));
122+
Graph.end_recording(Queue);
123+
112124
auto NodeEmpty = Graph.add();
113125

114126
experimental::command_graph Subgraph(Queue.get_context(), Dev);
@@ -375,3 +387,33 @@ TEST_F(WholeGraphUpdateTest, MissingUpdatableProperty) {
375387
auto GraphExec = Graph.finalize();
376388
EXPECT_THROW(GraphExec.update(UpdateGraph), sycl::exception);
377389
}
390+
391+
TEST_F(WholeGraphUpdateTest, EmptyNode) {
392+
// Test that updating a graph that has an empty node is not an error
393+
auto NodeEmpty = Graph.add();
394+
auto UpdateNodeEmpty = UpdateGraph.add();
395+
396+
auto NodeKernel = Graph.add(EmptyKernel);
397+
auto UpdateNodeKernel = UpdateGraph.add(EmptyKernel);
398+
399+
auto GraphExec = Graph.finalize(experimental::property::graph::updatable{});
400+
GraphExec.update(UpdateGraph);
401+
}
402+
403+
TEST_F(WholeGraphUpdateTest, BarrierNode) {
404+
// Test that updating a graph that has a barrier node is not an error
405+
Graph.begin_recording(Queue);
406+
auto NodeKernel = Queue.submit(
407+
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
408+
Queue.ext_oneapi_submit_barrier({NodeKernel});
409+
Graph.end_recording(Queue);
410+
411+
UpdateGraph.begin_recording(Queue);
412+
auto UpdateNodeKernel = Queue.submit(
413+
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
414+
Queue.ext_oneapi_submit_barrier({UpdateNodeKernel});
415+
UpdateGraph.end_recording(Queue);
416+
417+
auto GraphExec = Graph.finalize(experimental::property::graph::updatable{});
418+
GraphExec.update(UpdateGraph);
419+
}

0 commit comments

Comments
 (0)