Skip to content

Commit 1ebd224

Browse files
tovinkereagainull
andauthored
[SYCL][XPTI] Performance improvements for performance oriented toolchains (#19953)
Implementation of XPTI within SYCL combined debug and performance streams over time and this PR improves the performance by 15-20% for toolschains by separating the debug data from performance streams. - Limits the amount of debug information that is attached to the default "sycl" performance stream. - If addition metadata is needed, toolchains will have to subscribe to "sycl.debug" stream - Unified runtime created a new trace event for each function call which adds significant overhead to each call when a collector is enabled. This is now minimized by using a global event for all of UR API calls. - CUDA plugin implementation of XPTI events enabled both performance and debug streams all the time, irrespective of where there was a tool subscribing to the data. - If tools want good performance then they need to subscribe to "sycl" stream. If they want full information and performance is not as important then subscribing to "sycl.debug" stream is the right choice. --------- Signed-off-by: Vasanth Tovinkere <vasanth.tovinkere@intel.com> Co-authored-by: Gainullin, Artur <artur.gainullin@intel.com>
1 parent 5d9a176 commit 1ebd224

File tree

19 files changed

+313
-245
lines changed

19 files changed

+313
-245
lines changed

sycl/doc/design/SYCLInstrumentationUsingXPTI.md

Lines changed: 18 additions & 15 deletions
Large diffs are not rendered by default.

sycl/source/detail/event_impl.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ void event_impl::wait(bool *Success) {
288288
void *TelemetryEvent = nullptr;
289289
uint64_t IId = 0;
290290
std::string Name;
291-
TelemetryEvent = instrumentationProlog(Name, GSYCLStreamID, IId);
291+
auto StreamID = detail::getActiveXPTIStreamID();
292+
TelemetryEvent = instrumentationProlog(Name, StreamID, IId);
292293
#endif
293294

294295
auto EventHandle = getHandle();
@@ -300,7 +301,7 @@ void event_impl::wait(bool *Success) {
300301
detail::Scheduler::getInstance().waitForEvent(*this, Success);
301302

302303
#ifdef XPTI_ENABLE_INSTRUMENTATION
303-
instrumentationEpilog(TelemetryEvent, Name, GSYCLStreamID, IId);
304+
instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
304305
#endif
305306
}
306307

sycl/source/detail/global_handler.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,8 @@ void GlobalHandler::TraceEventXPTI(const char *Message) {
9595
xpti::framework::tracepoint_scope_t TP(
9696
CodeLocation.fileName(), CodeLocation.functionName(),
9797
CodeLocation.lineNumber(), CodeLocation.columnNumber(), nullptr);
98-
99-
TP.stream(detail::GSYCLStreamID)
98+
// Notify the subscriber with a diagnostic message when an exception occurs.
99+
TP.stream(detail::getActiveXPTIStreamID())
100100
.traceType(xpti::trace_point_type_t::diagnostics)
101101
.parentEvent(GSYCLCallEvent)
102102
.notify(static_cast<const void *>(Message));

sycl/source/detail/graph/graph_impl.cpp

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -731,20 +731,20 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect(
731731
const bool xptiEnabled = xptiTraceEnabled();
732732
xpti_td *CmdTraceEvent = nullptr;
733733
uint64_t InstanceID = 0;
734+
auto StreamID = detail::getActiveXPTIStreamID();
734735
if (xptiEnabled) {
735736
sycl::detail::CGExecKernel *CGExec =
736737
static_cast<sycl::detail::CGExecKernel *>(Node.MCommandGroup.get());
737738
sycl::detail::code_location CodeLoc(CGExec->MFileName.c_str(),
738739
CGExec->MFunctionName.c_str(),
739740
CGExec->MLine, CGExec->MColumn);
740741
std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
741-
sycl::detail::GSYCLStreamID, CGExec->MSyclKernel, CodeLoc,
742-
CGExec->MIsTopCodeLoc, CGExec->MDeviceKernelInfo, nullptr,
743-
CGExec->MNDRDesc, CGExec->MKernelBundle.get(), CGExec->MArgs);
742+
StreamID, CGExec->MSyclKernel, CodeLoc, CGExec->MIsTopCodeLoc,
743+
CGExec->MDeviceKernelInfo, nullptr, CGExec->MNDRDesc,
744+
CGExec->MKernelBundle.get(), CGExec->MArgs);
744745
if (CmdTraceEvent)
745-
sycl::detail::emitInstrumentationGeneral(sycl::detail::GSYCLStreamID,
746-
InstanceID, CmdTraceEvent,
747-
xpti::trace_task_begin, nullptr);
746+
sycl::detail::emitInstrumentationGeneral(
747+
StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr);
748748
}
749749
#endif
750750

@@ -764,9 +764,8 @@ ur_exp_command_buffer_sync_point_t exec_graph_impl::enqueueNodeDirect(
764764

765765
#ifdef XPTI_ENABLE_INSTRUMENTATION
766766
if (xptiEnabled && CmdTraceEvent)
767-
sycl::detail::emitInstrumentationGeneral(sycl::detail::GSYCLStreamID,
768-
InstanceID, CmdTraceEvent,
769-
xpti::trace_task_end, nullptr);
767+
sycl::detail::emitInstrumentationGeneral(
768+
StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_end, nullptr);
770769
#endif
771770

772771
return NewSyncPoint;

sycl/source/detail/queue_impl.cpp

Lines changed: 71 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -167,10 +167,12 @@ event queue_impl::memset(void *Ptr, int Value, size_t Count,
167167
xpti::framework::tracepoint_scope_t TP(
168168
CodeLocation.fileName(), FuncName, CodeLocation.lineNumber(),
169169
CodeLocation.columnNumber(), (void *)this);
170-
TP.stream(detail::GSYCLStreamID)
170+
TP.stream(detail::getActiveXPTIStreamID())
171171
.traceType(xpti::trace_point_type_t::node_create)
172172
.parentEvent(detail::GSYCLGraphEvent);
173173

174+
// This information is necessary for memset, so we will not guard it by debug
175+
// stream check.
174176
TP.addMetadata([&](auto TEvent) {
175177
xpti::addMetadata(TEvent, "sycl_device",
176178
reinterpret_cast<size_t>(MDevice.getHandleRef()));
@@ -219,10 +221,11 @@ event queue_impl::memcpy(void *Dest, const void *Src, size_t Count,
219221
xpti::framework::tracepoint_scope_t TP(
220222
CodeLoc.fileName(), CodeLoc.functionName(), CodeLoc.lineNumber(),
221223
CodeLoc.columnNumber(), (void *)this);
222-
TP.stream(detail::GSYCLStreamID)
224+
TP.stream(detail::getActiveXPTIStreamID())
223225
.traceType(xpti::trace_point_type_t::node_create)
224226
.parentEvent(GSYCLGraphEvent);
225227
const char *UserData = "memory_transfer_node::memcpy";
228+
// We will include this metadata information as it is required for memcpy.
226229
TP.addMetadata([&](auto TEvent) {
227230
xpti::addMetadata(TEvent, "sycl_device",
228231
reinterpret_cast<size_t>(MDevice.getHandleRef()));
@@ -515,33 +518,32 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
515518
if (!xptiCheckTraceEnabled(StreamID, NotificationTraceType))
516519
return TraceEvent;
517520

518-
xpti::payload_t Payload;
519-
bool HasSourceInfo = false;
521+
xpti_tracepoint_t *Event;
520522
// We try to create a unique string for the wait() call by combining it with
521523
// the queue address
522524
xpti::utils::StringHelper NG;
523525
Name = NG.nameWithAddress<queue_impl *>("queue.wait", this);
524526

525-
if (CodeLoc.fileName()) {
526-
// We have source code location information
527-
Payload =
528-
xpti::payload_t(Name.c_str(), CodeLoc.fileName(), CodeLoc.lineNumber(),
529-
CodeLoc.columnNumber(), (void *)this);
530-
HasSourceInfo = true;
531-
} else {
532-
// We have no location information, so we'll use the address of the queue
533-
Payload = xpti::payload_t(Name.c_str(), (void *)this);
534-
}
527+
bool HasSourceInfo = CodeLoc.fileName() != nullptr;
535528
// wait() calls could be at different user-code locations; We create a new
536529
// event based on the code location info and if this has been seen before, a
537530
// previously created event will be returned.
538-
uint64_t QWaitInstanceNo = 0;
539-
xpti::trace_event_data_t *WaitEvent =
540-
xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event,
541-
xpti_at::active, &QWaitInstanceNo);
542-
IId = QWaitInstanceNo;
543-
if (WaitEvent) {
544-
xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this));
531+
if (HasSourceInfo) {
532+
Event = xptiCreateTracepoint(CodeLoc.functionName(), CodeLoc.fileName(),
533+
CodeLoc.lineNumber(), CodeLoc.columnNumber(),
534+
(void *)this);
535+
} else {
536+
Event = xptiCreateTracepoint(Name.c_str(), nullptr, 0, 0, (void *)this);
537+
}
538+
539+
IId = xptiGetUniqueId();
540+
auto WaitEvent = Event->event_ref();
541+
// We will allow the device type to be set
542+
xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this));
543+
// We limit the amount of metadata that is added to the regular stream.
544+
// Only "sycl.debug" stream will have the full information. This improves the
545+
// performance when this data is not required by the tool or the collector.
546+
if (isDebugStream(StreamID)) {
545547
if (HasSourceInfo) {
546548
xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName());
547549
xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName());
@@ -551,11 +553,11 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
551553
WaitEvent, "sym_column_no",
552554
static_cast<xpti::object_id_t>((CodeLoc.columnNumber())));
553555
}
554-
xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent,
555-
QWaitInstanceNo,
556-
static_cast<const void *>(Name.c_str()));
557-
TraceEvent = (void *)WaitEvent;
558556
}
557+
xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent,
558+
IId, static_cast<const void *>(Name.c_str()));
559+
TraceEvent = (void *)WaitEvent;
560+
559561
return TraceEvent;
560562
}
561563

@@ -578,13 +580,11 @@ void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name,
578580
void queue_impl::wait(const detail::code_location &CodeLoc) {
579581
(void)CodeLoc;
580582
#ifdef XPTI_ENABLE_INSTRUMENTATION
581-
const bool xptiEnabled = xptiCheckTraceEnabled(GSYCLStreamID);
582583
void *TelemetryEvent = nullptr;
583584
uint64_t IId;
584585
std::string Name;
585-
if (xptiEnabled) {
586-
TelemetryEvent = instrumentationProlog(CodeLoc, Name, GSYCLStreamID, IId);
587-
}
586+
auto StreamID = detail::getActiveXPTIStreamID();
587+
TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
588588
#endif
589589

590590
if (!MGraph.expired()) {
@@ -664,62 +664,62 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
664664
}
665665

666666
#ifdef XPTI_ENABLE_INSTRUMENTATION
667-
if (xptiEnabled) {
668-
instrumentationEpilog(TelemetryEvent, Name, GSYCLStreamID, IId);
669-
}
667+
// There is an early return in instrumentationEpilog() if no subscribers are
668+
// subscribing to queue.wait().
669+
instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
670670
#endif
671671
}
672672

673673
void queue_impl::constructorNotification() {
674674
#if XPTI_ENABLE_INSTRUMENTATION
675-
if (xptiTraceEnabled()) {
676-
constexpr uint16_t NotificationTraceType =
677-
static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
678-
if (xptiCheckTraceEnabled(detail::GSYCLStreamID, NotificationTraceType)) {
679-
xpti::utils::StringHelper SH;
680-
std::string AddrStr = SH.addressAsString<size_t>(MQueueID);
681-
std::string QueueName = SH.nameWithAddressString("queue", AddrStr);
682-
// Create a payload for the queue create event as we do not get code
683-
// location for the queue create event
684-
xpti::payload_t QPayload(QueueName.c_str());
685-
MInstanceID = xptiGetUniqueId();
686-
uint64_t RetInstanceNo;
687-
xpti_td *TEvent =
688-
xptiMakeEvent("queue_create", &QPayload,
689-
(uint16_t)xpti::trace_event_type_t::algorithm,
690-
xpti_at::active, &RetInstanceNo);
691-
// Cache the trace event, stream id and instance IDs for the destructor
692-
MTraceEvent = (void *)TEvent;
693-
694-
xpti::addMetadata(TEvent, "sycl_context",
695-
reinterpret_cast<size_t>(MContext->getHandleRef()));
696-
xpti::addMetadata(TEvent, "sycl_device_name",
697-
MDevice.get_info<info::device::name>());
698-
xpti::addMetadata(TEvent, "sycl_device",
699-
reinterpret_cast<size_t>(MDevice.getHandleRef()));
700-
xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
701-
xpti::addMetadata(TEvent, "queue_id", MQueueID);
702-
xpti::addMetadata(TEvent, "queue_handle",
703-
reinterpret_cast<size_t>(getHandleRef()));
704-
// Also publish to TLS before notification
705-
xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
706-
xptiNotifySubscribers(detail::GSYCLStreamID,
707-
(uint16_t)xpti::trace_point_type_t::queue_create,
708-
nullptr, TEvent, MInstanceID,
709-
static_cast<const void *>("queue_create"));
710-
}
711-
}
675+
// If there are no subscribers to queue_create, return immediately.
676+
constexpr uint16_t NotificationTraceType =
677+
static_cast<uint16_t>(xpti::trace_point_type_t::queue_create);
678+
if (!anyTraceEnabled(NotificationTraceType))
679+
return;
680+
// We do not have CodeLoc for the queue constructor, so we will have to create
681+
// a queue name with the queue ID to create an event; this step can be avoided
682+
// by using CodeLoc.
683+
xpti::utils::StringHelper SH;
684+
std::string AddrStr = SH.addressAsString<size_t>(MQueueID);
685+
std::string QueueName = SH.nameWithAddressString("queue", AddrStr);
686+
687+
xpti_tracepoint_t *Event =
688+
xptiCreateTracepoint(QueueName.c_str(), nullptr, 0, 0, (void *)this);
689+
MInstanceID = xptiGetUniqueId();
690+
xpti_td *TEvent = Event->event_ref();
691+
// Cache the trace event, stream id and instance IDs for the destructor.
692+
MTraceEvent = (void *)TEvent;
693+
// We will allow the queue metadata to be set as this is performed
694+
// infrequently.
695+
xpti::addMetadata(TEvent, "sycl_context",
696+
reinterpret_cast<size_t>(MContext->getHandleRef()));
697+
xpti::addMetadata(TEvent, "sycl_device_name",
698+
MDevice.get_info<info::device::name>());
699+
xpti::addMetadata(TEvent, "sycl_device",
700+
reinterpret_cast<size_t>(MDevice.getHandleRef()));
701+
xpti::addMetadata(TEvent, "is_inorder", MIsInorder);
702+
xpti::addMetadata(TEvent, "queue_id", MQueueID);
703+
xpti::addMetadata(TEvent, "queue_handle",
704+
reinterpret_cast<size_t>(getHandleRef()));
705+
// Also publish to TLS before notification.
706+
xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID);
707+
xptiNotifySubscribers(detail::getActiveXPTIStreamID(),
708+
(uint16_t)xpti::trace_point_type_t::queue_create,
709+
nullptr, TEvent, MInstanceID,
710+
static_cast<const void *>("queue_create"));
712711
#endif
713712
}
714713

715714
void queue_impl::destructorNotification() {
716715
#if XPTI_ENABLE_INSTRUMENTATION
717716
constexpr uint16_t NotificationTraceType =
718717
static_cast<uint16_t>(xpti::trace_point_type_t::queue_destroy);
719-
if (xptiCheckTraceEnabled(detail::GSYCLStreamID, NotificationTraceType)) {
718+
if (anyTraceEnabled(NotificationTraceType)) {
720719
// Use the cached trace event, stream id and instance IDs for the
721720
// destructor
722-
xptiNotifySubscribers(detail::GSYCLStreamID, NotificationTraceType, nullptr,
721+
xptiNotifySubscribers(detail::getActiveXPTIStreamID(),
722+
NotificationTraceType, nullptr,
723723
(xpti::trace_event_data_t *)MTraceEvent, MInstanceID,
724724
static_cast<const void *>("queue_destroy"));
725725
xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent);

0 commit comments

Comments
 (0)