Skip to content

Commit 23e180b

Browse files
[SYCL] Implement queue flushing (#5052)
According to OpenCL spec: "To use event objects that refer to commands enqueued in a command-queue as event objects to wait on by commands enqueued in a different command-queue, the application must call a clFlush or any blocking commands that perform an implicit flush of the command-queue where the commands that refer to these event objects are enqueued." This patch adds a piQueueFlush function that's lowered to clFinish in OpenCL and called whenever an unsubmitted cross-queue dependency is about to be used.
1 parent 57255ce commit 23e180b

File tree

14 files changed

+380
-7
lines changed

14 files changed

+380
-7
lines changed

sycl/include/CL/sycl/detail/pi.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ _PI_API(piextContextCreateWithNativeHandle)
4545
_PI_API(piQueueCreate)
4646
_PI_API(piQueueGetInfo)
4747
_PI_API(piQueueFinish)
48+
_PI_API(piQueueFlush)
4849
_PI_API(piQueueRetain)
4950
_PI_API(piQueueRelease)
5051
_PI_API(piextQueueGetNativeHandle)

sycl/include/CL/sycl/detail/pi.h

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,12 @@
4040
// changes the API version from 3.5 to 4.6.
4141
// 5.7 Added new context and ownership arguments to
4242
// piextEventCreateWithNativeHandle
43-
// 6.8 Added new ownership argument to piextProgramCreateWithNativeHandle.
43+
// 6.8 Added new ownership argument to piextProgramCreateWithNativeHandle. Added
44+
// piQueueFlush function.
4445
//
4546
#include "CL/cl.h"
46-
#define _PI_H_VERSION_MAJOR 5
47-
#define _PI_H_VERSION_MINOR 7
47+
#define _PI_H_VERSION_MAJOR 6
48+
#define _PI_H_VERSION_MINOR 8
4849

4950
#define _PI_STRING_HELPER(a) #a
5051
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -1083,6 +1084,8 @@ __SYCL_EXPORT pi_result piQueueRelease(pi_queue command_queue);
10831084

10841085
__SYCL_EXPORT pi_result piQueueFinish(pi_queue command_queue);
10851086

1087+
__SYCL_EXPORT pi_result piQueueFlush(pi_queue command_queue);
1088+
10861089
/// Gets the native handle of a PI queue object.
10871090
///
10881091
/// \param queue is the PI queue to get the native handle of.

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2254,6 +2254,11 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) {
22542254
return result;
22552255
}
22562256

2257+
// There is no CUDA counterpart for queue flushing and we don't run into the
2258+
// same problem of having to flush cross-queue dependencies as some of the
2259+
// other plugins, so it can be left as no-op.
2260+
pi_result cuda_piQueueFlush(pi_queue command_queue) { return PI_SUCCESS; }
2261+
22572262
/// Gets the native CUDA handle of a PI queue object
22582263
///
22592264
/// \param[in] queue The PI queue to get the native CUDA object of.
@@ -4886,6 +4891,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
48864891
_PI_CL(piQueueCreate, cuda_piQueueCreate)
48874892
_PI_CL(piQueueGetInfo, cuda_piQueueGetInfo)
48884893
_PI_CL(piQueueFinish, cuda_piQueueFinish)
4894+
_PI_CL(piQueueFlush, cuda_piQueueFlush)
48894895
_PI_CL(piQueueRetain, cuda_piQueueRetain)
48904896
_PI_CL(piQueueRelease, cuda_piQueueRelease)
48914897
_PI_CL(piextQueueGetNativeHandle, cuda_piextQueueGetNativeHandle)

sycl/plugins/hip/pi_hip.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2202,6 +2202,11 @@ pi_result hip_piQueueFinish(pi_queue command_queue) {
22022202
return result;
22032203
}
22042204

2205+
// There is no HIP counterpart for queue flushing and we don't run into the
2206+
// same problem of having to flush cross-queue dependencies as some of the
2207+
// other plugins, so it can be left as no-op.
2208+
pi_result hip_piQueueFlush(pi_queue command_queue) { return PI_SUCCESS; }
2209+
22052210
/// Gets the native HIP handle of a PI queue object
22062211
///
22072212
/// \param[in] queue The PI queue to get the native HIP object of.
@@ -4820,6 +4825,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
48204825
_PI_CL(piQueueCreate, hip_piQueueCreate)
48214826
_PI_CL(piQueueGetInfo, hip_piQueueGetInfo)
48224827
_PI_CL(piQueueFinish, hip_piQueueFinish)
4828+
_PI_CL(piQueueFlush, hip_piQueueFlush)
48234829
_PI_CL(piQueueRetain, hip_piQueueRetain)
48244830
_PI_CL(piQueueRelease, hip_piQueueRelease)
48254831
_PI_CL(piextQueueGetNativeHandle, hip_piextQueueGetNativeHandle)

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3029,6 +3029,10 @@ pi_result piQueueFinish(pi_queue Queue) {
30293029
return PI_SUCCESS;
30303030
}
30313031

3032+
// Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
3033+
// so this can be left as a no-op.
3034+
pi_result piQueueFlush(pi_queue Queue) { return PI_SUCCESS; }
3035+
30323036
pi_result piextQueueGetNativeHandle(pi_queue Queue,
30333037
pi_native_handle *NativeHandle) {
30343038
PI_ASSERT(Queue, PI_INVALID_QUEUE);

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,6 +1424,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
14241424
_PI_CL(piQueueCreate, piQueueCreate)
14251425
_PI_CL(piQueueGetInfo, clGetCommandQueueInfo)
14261426
_PI_CL(piQueueFinish, clFinish)
1427+
_PI_CL(piQueueFlush, clFlush)
14271428
_PI_CL(piQueueRetain, clRetainCommandQueue)
14281429
_PI_CL(piQueueRelease, clReleaseCommandQueue)
14291430
_PI_CL(piextQueueGetNativeHandle, piextQueueGetNativeHandle)

sycl/source/detail/event_impl.cpp

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,12 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) {
9393
MState = HES_NotComplete;
9494
}
9595

96-
event_impl::event_impl() : MState(HES_Complete) {}
96+
event_impl::event_impl() : MIsFlushed(true), MState(HES_Complete) {}
9797

9898
event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
9999
: MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)),
100-
MOpenCLInterop(true), MHostEvent(false), MState(HES_Complete) {
100+
MOpenCLInterop(true), MHostEvent(false), MIsFlushed(true),
101+
MState(HES_Complete) {
101102

102103
if (MContext->is_host()) {
103104
throw cl::sycl::invalid_parameter_error(
@@ -120,7 +121,7 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
120121
getPlugin().call<PiApiKind::piEventRetain>(MEvent);
121122
}
122123

123-
event_impl::event_impl(QueueImplPtr Queue) {
124+
event_impl::event_impl(const QueueImplPtr &Queue) : MQueue{Queue} {
124125
if (Queue->is_host()) {
125126
MState.store(HES_NotComplete);
126127

@@ -344,6 +345,32 @@ std::vector<EventImplPtr> event_impl::getWaitList() {
344345
return Result;
345346
}
346347

348+
void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
349+
assert(MEvent != nullptr);
350+
if (MIsFlushed)
351+
return;
352+
353+
QueueImplPtr Queue = MQueue.lock();
354+
// If the queue has been released, all of the commands have already been
355+
// implicitly flushed by piQueueRelease.
356+
if (!Queue) {
357+
MIsFlushed = true;
358+
return;
359+
}
360+
if (Queue == UserQueue)
361+
return;
362+
363+
// Check if the task for this event has already been submitted.
364+
pi_event_status Status = PI_EVENT_QUEUED;
365+
getPlugin().call<PiApiKind::piEventGetInfo>(
366+
MEvent, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(pi_int32), &Status,
367+
nullptr);
368+
if (Status == PI_EVENT_QUEUED) {
369+
getPlugin().call<PiApiKind::piQueueFlush>(Queue->getHandleRef());
370+
}
371+
MIsFlushed = true;
372+
}
373+
347374
void event_impl::cleanupDependencyEvents() {
348375
std::lock_guard<std::mutex> Lock(MMutex);
349376
MPreparedDepsEvents.clear();

sycl/source/detail/event_impl.hpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class event_impl {
4444
/// \param Event is a valid instance of plug-in event.
4545
/// \param SyclContext is an instance of SYCL context.
4646
event_impl(RT::PiEvent Event, const context &SyclContext);
47-
event_impl(QueueImplPtr Queue);
47+
event_impl(const QueueImplPtr &Queue);
4848

4949
/// Checks if this event is a SYCL host event.
5050
///
@@ -182,6 +182,11 @@ class event_impl {
182182
/// @return a vector of "immediate" dependencies for this event_impl.
183183
std::vector<EventImplPtr> getWaitList();
184184

185+
/// Performs a flush on the queue associated with this event if the user queue
186+
/// is different and the task associated with this event hasn't been submitted
187+
/// to the device yet.
188+
void flushIfNeeded(const QueueImplPtr &UserQueue);
189+
185190
/// Cleans dependencies of this event_impl
186191
void cleanupDependencyEvents();
187192

@@ -200,11 +205,16 @@ class event_impl {
200205
bool MHostEvent = true;
201206
std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
202207
void *MCommand = nullptr;
208+
std::weak_ptr<queue_impl> MQueue;
203209

204210
/// Dependency events prepared for waiting by backend.
205211
std::vector<EventImplPtr> MPreparedDepsEvents;
206212
std::vector<EventImplPtr> MPreparedHostDepsEvents;
207213

214+
/// Indicates that the task associated with this event has been submitted by
215+
/// the queue to the device.
216+
std::atomic<bool> MIsFlushed = false;
217+
208218
enum HostEventState : int { HES_NotComplete = 0, HES_Complete };
209219

210220
// State of host event. Employed only for host events and event with no

sycl/source/detail/scheduler/commands.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,13 @@ getPiEvents(const std::vector<EventImplPtr> &EventImpls) {
169169
return RetPiEvents;
170170
}
171171

172+
static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
173+
const QueueImplPtr &Queue) {
174+
for (auto &EventImpl : EventImpls) {
175+
EventImpl->flushIfNeeded(Queue);
176+
}
177+
}
178+
172179
class DispatchHostTask {
173180
ExecCGCommand *MThisCmd;
174181
std::vector<interop_handle::ReqToMem> MReqToMem;
@@ -325,6 +332,7 @@ void Command::waitForEvents(QueueImplPtr Queue,
325332
#endif
326333

327334
std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
335+
flushCrossQueueDeps(EventImpls, getWorkerQueue());
328336
const detail::plugin &Plugin = Queue->getPlugin();
329337
Plugin.call<PiApiKind::piEnqueueEventsWait>(
330338
Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
@@ -1073,6 +1081,7 @@ cl_int MapMemObject::enqueueImp() {
10731081
waitForPreparedHostEvents();
10741082
std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
10751083
std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1084+
flushCrossQueueDeps(EventImpls, getWorkerQueue());
10761085

10771086
RT::PiEvent &Event = MEvent->getHandleRef();
10781087
*MDstPtr = MemoryManager::map(
@@ -1150,6 +1159,7 @@ cl_int UnMapMemObject::enqueueImp() {
11501159
waitForPreparedHostEvents();
11511160
std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
11521161
std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
1162+
flushCrossQueueDeps(EventImpls, getWorkerQueue());
11531163

11541164
RT::PiEvent &Event = MEvent->getHandleRef();
11551165
MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
@@ -1250,6 +1260,7 @@ cl_int MemCpyCommand::enqueueImp() {
12501260
RT::PiEvent &Event = MEvent->getHandleRef();
12511261

12521262
auto RawEvents = getPiEvents(EventImpls);
1263+
flushCrossQueueDeps(EventImpls, getWorkerQueue());
12531264

12541265
MemoryManager::copy(
12551266
MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
@@ -1400,6 +1411,7 @@ cl_int MemCpyCommandHost::enqueueImp() {
14001411
return CL_SUCCESS;
14011412
}
14021413

1414+
flushCrossQueueDeps(EventImpls, getWorkerQueue());
14031415
MemoryManager::copy(
14041416
MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
14051417
MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
@@ -1989,6 +2001,7 @@ cl_int ExecCGCommand::enqueueImp() {
19892001
waitForPreparedHostEvents();
19902002
std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
19912003
auto RawEvents = getPiEvents(EventImpls);
2004+
flushCrossQueueDeps(EventImpls, getWorkerQueue());
19922005

19932006
RT::PiEvent &Event = MEvent->getHandleRef();
19942007

sycl/test/abi/pi_level_zero_symbol_check.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ piProgramRelease
7070
piProgramRetain
7171
piQueueCreate
7272
piQueueFinish
73+
piQueueFlush
7374
piQueueGetInfo
7475
piQueueRelease
7576
piQueueRetain

sycl/unittests/helpers/CommonRedefinitions.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,18 @@ inline pi_result redefinedEventsWaitCommon(pi_uint32 num_events,
114114
return PI_SUCCESS;
115115
}
116116

117+
inline pi_result redefinedEventGetInfoCommon(pi_event event,
118+
pi_event_info param_name,
119+
size_t param_value_size,
120+
void *param_value,
121+
size_t *param_value_size_ret) {
122+
if (param_name == PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) {
123+
auto *status = reinterpret_cast<pi_event_status *>(param_value);
124+
*status = PI_EVENT_SUBMITTED;
125+
}
126+
return PI_SUCCESS;
127+
}
128+
117129
inline pi_result redefinedEventReleaseCommon(pi_event event) {
118130
if (event != nullptr)
119131
delete reinterpret_cast<int *>(event);
@@ -166,6 +178,7 @@ inline void setupDefaultMockAPIs(sycl::unittest::PiMock &Mock) {
166178
Mock.redefine<PiApiKind::piKernelSetExecInfo>(
167179
redefinedKernelSetExecInfoCommon);
168180
Mock.redefine<PiApiKind::piEventsWait>(redefinedEventsWaitCommon);
181+
Mock.redefine<PiApiKind::piEventGetInfo>(redefinedEventGetInfoCommon);
169182
Mock.redefine<PiApiKind::piEventRelease>(redefinedEventReleaseCommon);
170183
Mock.redefine<PiApiKind::piEnqueueKernelLaunch>(
171184
redefinedEnqueueKernelLaunchCommon);

sycl/unittests/scheduler/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,6 @@ add_sycl_unittest(SchedulerTests OBJECT
1515
InOrderQueueHostTaskDeps.cpp
1616
AllocaLinking.cpp
1717
RequiredWGSize.cpp
18+
QueueFlushing.cpp
1819
utils.cpp
1920
)

0 commit comments

Comments
 (0)