Skip to content

Commit 72341ee

Browse files
BensuoreblejulianmiEwanC
authored
[SYCL][Graph] Backend integration and feature additions for SYCL Graphs (3/4) (#10033)
# Backend integration and feature additions for SYCL Graphs This is the third patch of a series that adds support for an [experimental command graph extension](#5626) A snapshot of the complete work can be seen in draft PR #9375 which has support all the specification defined ways of adding nodes and edges to the graph, including both Explicit and Record & Replay graph construction. The two types of nodes currently implemented are kernel execution and memcpy commands. See https://github.com/reble/llvm#implementation-status for the status of our total work. ## Scope This third patch focuses on integrating the graphs runtime with the backend support added in #9992 as well as any remaining runtime features and bug fixes, and should include no ABI-breaking changes: * Graphs runtime changes to use PI/UR command-buffers. * Various improvements to the Graphs runtime classes. * New memory manager methods for appending copies to a command-buffer. * Changes to the Scheduler and related CG classes to enable Graphs. * Device info query for command-graph support. * Minor changes to some runtime classes to enable Graphs. ## Following Split PRs Future follow-up PRs with the remainder of our work on the extension will include: * Add end-to-end tests for SYCL Graph extension. (4/4) * NFC changes - Design doc and codeowner update. ## Authors Co-authored-by: Pablo Reble <pablo.reble@intel.com> Co-authored-by: Julian Miller <julian.miller@intel.com> Co-authored-by: Ben Tracy <ben.tracy@codeplay.com> Co-authored-by: Ewan Crawford <ewan@codeplay.com> Co-authored-by: Maxime France-Pillois <maxime.francepillois@codeplay.com>
1 parent 5d1da25 commit 72341ee

32 files changed

+1699
-231
lines changed

sycl/include/sycl/detail/cg.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,17 @@ class CGCopyFromDeviceGlobal : public CG {
497497
size_t getOffset() { return MOffset; }
498498
};
499499

500+
/// "Execute command-buffer" command group class.
501+
class CGExecCommandBuffer : public CG {
502+
public:
503+
sycl::detail::pi::PiExtCommandBuffer MCommandBuffer;
504+
505+
CGExecCommandBuffer(sycl::detail::pi::PiExtCommandBuffer CommandBuffer,
506+
CG::StorageInitHelper CGData)
507+
: CG(CGTYPE::ExecCommandBuffer, std::move(CGData)),
508+
MCommandBuffer(CommandBuffer) {}
509+
};
510+
500511
} // namespace detail
501512
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
502513
} // namespace sycl

sycl/include/sycl/detail/pi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2284,7 +2284,7 @@ __SYCL_EXPORT pi_result piextCommandBufferMemBufferCopyRect(
22842284
/// \param sync_point The sync_point associated with this memory operation.
22852285
__SYCL_EXPORT pi_result piextCommandBufferMemBufferRead(
22862286
pi_ext_command_buffer command_buffer, pi_mem buffer, size_t offset,
2287-
size_t size, void *dst, pi_uint32 num_events_in_wait_list,
2287+
size_t size, void *dst, pi_uint32 num_sync_points_in_wait_list,
22882288
const pi_ext_sync_point *sync_point_wait_list,
22892289
pi_ext_sync_point *sync_point);
22902290

sycl/include/sycl/ext/oneapi/experimental/graph.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -221,8 +221,6 @@ class __SYCL_EXPORT executable_command_graph {
221221

222222
int MTag;
223223
std::shared_ptr<detail::exec_graph_impl> impl;
224-
225-
friend class modifiable_command_graph;
226224
};
227225
} // namespace detail
228226

@@ -248,7 +246,10 @@ class command_graph : public detail::modifiable_command_graph {
248246
template <>
249247
class command_graph<graph_state::executable>
250248
: public detail::executable_command_graph {
251-
private:
249+
250+
protected:
251+
friend command_graph<graph_state::executable>
252+
detail::modifiable_command_graph::finalize(const sycl::property_list &) const;
252253
using detail::executable_command_graph::executable_command_graph;
253254
};
254255

sycl/include/sycl/handler.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1542,6 +1542,12 @@ class __SYCL_EXPORT handler {
15421542
setType(detail::CG::CodeplayHostTask);
15431543
}
15441544

1545+
/// @brief Get the command graph if any associated with this handler. It can
1546+
/// come from either the associated queue or from being set explicitly through
1547+
/// the appropriate constructor.
1548+
std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
1549+
getCommandGraph() const;
1550+
15451551
public:
15461552
handler(const handler &) = delete;
15471553
handler(handler &&) = delete;

sycl/include/sycl/info/ext_oneapi_device_traits.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ __SYCL_PARAM_TRAITS_TEMPLATE_SPEC(ext::oneapi::experimental,device, max_work_gro
99
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, device, architecture,
1010
ext::oneapi::experimental::architecture,
1111
PI_EXT_ONEAPI_DEVICE_INFO_IP_VERSION)
12+
__SYCL_PARAM_TRAITS_SPEC(
13+
ext::oneapi::experimental, device, graph_support,
14+
ext::oneapi::experimental::info::graph_support_level,
15+
0 /* No PI device code needed */)
1216
#ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF
1317
#undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
1418
#undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF

sycl/include/sycl/info/info_desc.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,9 +185,14 @@ template <typename T, T param> struct compatibility_param_traits {};
185185
} /*namespace info */ \
186186
} /*namespace Namespace */
187187

188-
namespace ext::oneapi::experimental::info::device {
188+
namespace ext::oneapi::experimental::info {
189+
190+
enum class graph_support_level { unsupported = 0, native, emulated };
191+
192+
namespace device {
189193
template <int Dimensions> struct max_work_groups;
190-
} // namespace ext::oneapi::experimental::info::device
194+
} // namespace device
195+
} // namespace ext::oneapi::experimental::info
191196
#include <sycl/info/ext_codeplay_device_traits.def>
192197
#include <sycl/info/ext_intel_device_traits.def>
193198
#include <sycl/info/ext_oneapi_device_traits.def>

sycl/include/sycl/queue.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,16 @@ static event submitAssertCapture(queue &, event &, queue *,
7676
#endif
7777
} // namespace detail
7878

79+
namespace ext {
80+
namespace oneapi {
81+
namespace experimental {
82+
// State of a queue with regards to graph recording,
83+
// returned by info::queue::state
84+
enum class queue_state { executing, recording };
85+
} // namespace experimental
86+
} // namespace oneapi
87+
} // namespace ext
88+
7989
/// Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
8090
///
8191
/// A SYCL queue can be used to submit command groups to be executed by the SYCL
@@ -283,6 +293,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
283293
/// \return SYCL device this queue was constructed with.
284294
device get_device() const;
285295

296+
/// \return State the queue is currently in.
297+
ext::oneapi::experimental::queue_state ext_oneapi_get_state() const;
298+
286299
/// \return true if this queue is a SYCL host queue.
287300
__SYCL2020_DEPRECATED(
288301
"is_host() is deprecated as the host device is no longer supported.")

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,46 +1118,46 @@ pi_result piextCommandBufferMemBufferCopyRect(
11181118

11191119
pi_result piextCommandBufferMemBufferRead(
11201120
pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset,
1121-
size_t Size, void *Dst, pi_uint32 NumEventsInWaitList,
1121+
size_t Size, void *Dst, pi_uint32 NumSyncPointsInWaitList,
11221122
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
1123-
return pi2ur::piextCommandBufferMemBufferRead(CommandBuffer, Buffer, Offset,
1124-
Size, Dst, NumEventsInWaitList,
1125-
SyncPointWaitList, SyncPoint);
1123+
return pi2ur::piextCommandBufferMemBufferRead(
1124+
CommandBuffer, Buffer, Offset, Size, Dst, NumSyncPointsInWaitList,
1125+
SyncPointWaitList, SyncPoint);
11261126
}
11271127

11281128
pi_result piextCommandBufferMemBufferReadRect(
11291129
pi_ext_command_buffer CommandBuffer, pi_mem Buffer,
11301130
pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset,
11311131
pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch,
11321132
size_t HostRowPitch, size_t HostSlicePitch, void *Ptr,
1133-
pi_uint32 NumEventsInWaitList, const pi_ext_sync_point *SyncPointWaitList,
1134-
pi_ext_sync_point *SyncPoint) {
1133+
pi_uint32 NumSyncPointsInWaitList,
1134+
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
11351135
return pi2ur::piextCommandBufferMemBufferReadRect(
11361136
CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch,
1137-
BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, NumEventsInWaitList,
1138-
SyncPointWaitList, SyncPoint);
1137+
BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr,
1138+
NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
11391139
}
11401140

11411141
pi_result piextCommandBufferMemBufferWrite(
11421142
pi_ext_command_buffer CommandBuffer, pi_mem Buffer, size_t Offset,
1143-
size_t Size, const void *Ptr, pi_uint32 NumEventsInWaitList,
1143+
size_t Size, const void *Ptr, pi_uint32 NumSyncPointsInWaitList,
11441144
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
1145-
return pi2ur::piextCommandBufferMemBufferWrite(CommandBuffer, Buffer, Offset,
1146-
Size, Ptr, NumEventsInWaitList,
1147-
SyncPointWaitList, SyncPoint);
1145+
return pi2ur::piextCommandBufferMemBufferWrite(
1146+
CommandBuffer, Buffer, Offset, Size, Ptr, NumSyncPointsInWaitList,
1147+
SyncPointWaitList, SyncPoint);
11481148
}
11491149

11501150
pi_result piextCommandBufferMemBufferWriteRect(
11511151
pi_ext_command_buffer CommandBuffer, pi_mem Buffer,
11521152
pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset,
11531153
pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch,
11541154
size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr,
1155-
pi_uint32 NumEventsInWaitList, const pi_ext_sync_point *SyncPointWaitList,
1156-
pi_ext_sync_point *SyncPoint) {
1155+
pi_uint32 NumSyncPointsInWaitList,
1156+
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
11571157
return pi2ur::piextCommandBufferMemBufferWriteRect(
11581158
CommandBuffer, Buffer, BufferOffset, HostOffset, Region, BufferRowPitch,
1159-
BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr, NumEventsInWaitList,
1160-
SyncPointWaitList, SyncPoint);
1159+
BufferSlicePitch, HostRowPitch, HostSlicePitch, Ptr,
1160+
NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
11611161
}
11621162

11631163
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer,

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2726,6 +2726,12 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
27262726
_PI_CL(piextCommandBufferMemBufferCopy, piextCommandBufferMemBufferCopy)
27272727
_PI_CL(piextCommandBufferMemBufferCopyRect,
27282728
piextCommandBufferMemBufferCopyRect)
2729+
_PI_CL(piextCommandBufferMemBufferRead, piextCommandBufferMemBufferRead)
2730+
_PI_CL(piextCommandBufferMemBufferReadRect,
2731+
piextCommandBufferMemBufferReadRect)
2732+
_PI_CL(piextCommandBufferMemBufferWrite, piextCommandBufferMemBufferWrite)
2733+
_PI_CL(piextCommandBufferMemBufferWriteRect,
2734+
piextCommandBufferMemBufferWriteRect)
27292735
_PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer)
27302736

27312737
_PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj)

sycl/plugins/unified_runtime/ur/adapters/level_zero/command_buffer.cpp

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -535,9 +535,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp(
535535
uint32_t NumSyncPointsInWaitList,
536536
const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
537537
ur_exp_command_buffer_sync_point_t *SyncPoint) {
538-
(void)SrcOffset;
539-
(void)DstOffset;
540-
541538
auto SrcBuffer = ur_cast<ur_mem_handle_t>(SrcMem);
542539
auto DstBuffer = ur_cast<ur_mem_handle_t>(DstMem);
543540

@@ -553,8 +550,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyExp(
553550
CommandBuffer->Device));
554551

555552
return enqueueCommandBufferMemCopyHelper(
556-
UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst, ZeHandleSrc, Size,
557-
NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint);
553+
UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst + DstOffset,
554+
ZeHandleSrc + SrcOffset, Size, NumSyncPointsInWaitList, SyncPointWaitList,
555+
SyncPoint);
558556
}
559557

560558
UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMembufferCopyRectExp(

sycl/plugins/unified_runtime/ur/adapters/level_zero/device.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
186186
(Device->ZeDeviceProperties->deviceId & 0xff0) == 0xbd0)
187187
SupportedExtensions += ("cl_intel_bfloat16_conversions ");
188188

189+
// Return supported for the UR command-buffer experimental feature
190+
SupportedExtensions += ("ur_exp_command_buffer ");
191+
189192
return ReturnValue(SupportedExtensions.c_str());
190193
}
191194
case UR_DEVICE_INFO_NAME:

sycl/source/detail/device_info.hpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,35 @@ struct get_device_info_impl<
884884
}
885885
};
886886

887+
// Specialization for graph extension support
888+
template <>
889+
struct get_device_info_impl<
890+
ext::oneapi::experimental::info::graph_support_level,
891+
ext::oneapi::experimental::info::device::graph_support> {
892+
static ext::oneapi::experimental::info::graph_support_level
893+
get(const DeviceImplPtr &Dev) {
894+
size_t ResultSize = 0;
895+
Dev->getPlugin()->call<PiApiKind::piDeviceGetInfo>(
896+
Dev->getHandleRef(), PI_DEVICE_INFO_EXTENSIONS, 0, nullptr,
897+
&ResultSize);
898+
if (ResultSize == 0)
899+
return ext::oneapi::experimental::info::graph_support_level::unsupported;
900+
901+
std::unique_ptr<char[]> Result(new char[ResultSize]);
902+
Dev->getPlugin()->call<PiApiKind::piDeviceGetInfo>(
903+
Dev->getHandleRef(), PI_DEVICE_INFO_EXTENSIONS, ResultSize,
904+
Result.get(), nullptr);
905+
906+
std::string_view ExtensionsString(Result.get());
907+
bool CmdBufferSupport =
908+
ExtensionsString.find("ur_exp_command_buffer") != std::string::npos;
909+
return CmdBufferSupport
910+
? ext::oneapi::experimental::info::graph_support_level::native
911+
: ext::oneapi::experimental::info::graph_support_level::
912+
unsupported;
913+
}
914+
};
915+
887916
template <typename Param>
888917
typename Param::return_type get_device_info(const DeviceImplPtr &Dev) {
889918
static_assert(is_device_info_desc<Param>::value,
@@ -1778,6 +1807,13 @@ inline uint32_t get_device_info_host<
17781807
PI_ERROR_INVALID_DEVICE);
17791808
}
17801809

1810+
template <>
1811+
inline ext::oneapi::experimental::info::graph_support_level
1812+
get_device_info_host<ext::oneapi::experimental::info::device::graph_support>() {
1813+
// No support for graphs on the host device.
1814+
return ext::oneapi::experimental::info::graph_support_level::unsupported;
1815+
}
1816+
17811817
} // namespace detail
17821818
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
17831819
} // namespace sycl

sycl/source/detail/event_impl.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,12 @@ void event_impl::wait(std::shared_ptr<sycl::detail::event_impl> Self) {
226226
throw sycl::exception(make_error_code(errc::invalid),
227227
"wait method cannot be used for a discarded event.");
228228

229+
if (MGraph.lock()) {
230+
throw sycl::exception(make_error_code(errc::invalid),
231+
"wait method cannot be used for an event associated "
232+
"with a command graph.");
233+
}
234+
229235
#ifdef XPTI_ENABLE_INSTRUMENTATION
230236
void *TelemetryEvent = nullptr;
231237
uint64_t IId;

sycl/source/detail/event_impl.hpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@
2323

2424
namespace sycl {
2525
__SYCL_INLINE_VER_NAMESPACE(_V1) {
26+
namespace ext::oneapi::experimental::detail {
27+
class graph_impl;
28+
}
2629
class context;
2730
namespace detail {
2831
class plugin;
@@ -256,6 +259,25 @@ class event_impl {
256259
return MContext;
257260
}
258261

262+
// Sets a sync point which is used when this event represents an enqueue to a
263+
// Command Bufferr.
264+
void setSyncPoint(sycl::detail::pi::PiExtSyncPoint SyncPoint) {
265+
MSyncPoint = SyncPoint;
266+
}
267+
268+
// Get the sync point associated with this event.
269+
sycl::detail::pi::PiExtSyncPoint getSyncPoint() const { return MSyncPoint; }
270+
271+
void setCommandGraph(
272+
std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
273+
MGraph = Graph;
274+
}
275+
276+
std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
277+
getCommandGraph() const {
278+
return MGraph.lock();
279+
}
280+
259281
protected:
260282
// When instrumentation is enabled emits trace event for event wait begin and
261283
// returns the telemetry event generated for the wait
@@ -302,6 +324,15 @@ class event_impl {
302324
std::mutex MMutex;
303325
std::condition_variable cv;
304326

327+
/// Store the command graph associated with this event, if any.
328+
/// This event is also be stored in the graph so a weak_ptr is used.
329+
std::weak_ptr<ext::oneapi::experimental::detail::graph_impl> MGraph;
330+
331+
// If this event represents a submission to a
332+
// sycl::detail::pi::PiExtCommandBuffer the sync point for that submission is
333+
// stored here.
334+
sycl::detail::pi::PiExtSyncPoint MSyncPoint;
335+
305336
friend std::vector<sycl::detail::pi::PiEvent>
306337
getOrWaitEvents(std::vector<sycl::event> DepEvents,
307338
std::shared_ptr<sycl::detail::context_impl> Context);

0 commit comments

Comments
 (0)