Skip to content

[SYCL] Implement queue flushing #5052

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Dec 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sycl/include/CL/sycl/detail/pi.def
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ _PI_API(piextContextCreateWithNativeHandle)
_PI_API(piQueueCreate)
_PI_API(piQueueGetInfo)
_PI_API(piQueueFinish)
_PI_API(piQueueFlush)
_PI_API(piQueueRetain)
_PI_API(piQueueRelease)
_PI_API(piextQueueGetNativeHandle)
Expand Down
9 changes: 6 additions & 3 deletions sycl/include/CL/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,12 @@
// changes the API version from 3.5 to 4.6.
// 5.7 Added new context and ownership arguments to
// piextEventCreateWithNativeHandle
// 6.8 Added new ownership argument to piextProgramCreateWithNativeHandle.
// 6.8 Added new ownership argument to piextProgramCreateWithNativeHandle. Added
// piQueueFlush function.
//
#include "CL/cl.h"
#define _PI_H_VERSION_MAJOR 5
#define _PI_H_VERSION_MINOR 7
#define _PI_H_VERSION_MAJOR 6
#define _PI_H_VERSION_MINOR 8

#define _PI_STRING_HELPER(a) #a
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
Expand Down Expand Up @@ -1082,6 +1083,8 @@ __SYCL_EXPORT pi_result piQueueRelease(pi_queue command_queue);

__SYCL_EXPORT pi_result piQueueFinish(pi_queue command_queue);

__SYCL_EXPORT pi_result piQueueFlush(pi_queue command_queue);

/// Gets the native handle of a PI queue object.
///
/// \param queue is the PI queue to get the native handle of.
Expand Down
6 changes: 6 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2254,6 +2254,11 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) {
return result;
}

// There is no CUDA counterpart for queue flushing and we don't run into the
// same problem of having to flush cross-queue dependencies as some of the
// other plugins, so it can be left as no-op.
pi_result cuda_piQueueFlush(pi_queue command_queue) { return PI_SUCCESS; }

/// Gets the native CUDA handle of a PI queue object
///
/// \param[in] queue The PI queue to get the native CUDA object of.
Expand Down Expand Up @@ -4886,6 +4891,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piQueueCreate, cuda_piQueueCreate)
_PI_CL(piQueueGetInfo, cuda_piQueueGetInfo)
_PI_CL(piQueueFinish, cuda_piQueueFinish)
_PI_CL(piQueueFlush, cuda_piQueueFlush)
_PI_CL(piQueueRetain, cuda_piQueueRetain)
_PI_CL(piQueueRelease, cuda_piQueueRelease)
_PI_CL(piextQueueGetNativeHandle, cuda_piextQueueGetNativeHandle)
Expand Down
6 changes: 6 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2202,6 +2202,11 @@ pi_result hip_piQueueFinish(pi_queue command_queue) {
return result;
}

// There is no HIP counterpart for queue flushing and we don't run into the
// same problem of having to flush cross-queue dependencies as some of the
// other plugins, so it can be left as no-op.
pi_result hip_piQueueFlush(pi_queue command_queue) { return PI_SUCCESS; }

/// Gets the native HIP handle of a PI queue object
///
/// \param[in] queue The PI queue to get the native HIP object of.
Expand Down Expand Up @@ -4820,6 +4825,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piQueueCreate, hip_piQueueCreate)
_PI_CL(piQueueGetInfo, hip_piQueueGetInfo)
_PI_CL(piQueueFinish, hip_piQueueFinish)
_PI_CL(piQueueFlush, hip_piQueueFlush)
_PI_CL(piQueueRetain, hip_piQueueRetain)
_PI_CL(piQueueRelease, hip_piQueueRelease)
_PI_CL(piextQueueGetNativeHandle, hip_piextQueueGetNativeHandle)
Expand Down
4 changes: 4 additions & 0 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2999,6 +2999,10 @@ pi_result piQueueFinish(pi_queue Queue) {
return PI_SUCCESS;
}

// Flushing cross-queue dependencies is covered by createAndRetainPiZeEventList,
// so this can be left as a no-op.
pi_result piQueueFlush(pi_queue Queue) { return PI_SUCCESS; }

pi_result piextQueueGetNativeHandle(pi_queue Queue,
pi_native_handle *NativeHandle) {
PI_ASSERT(Queue, PI_INVALID_QUEUE);
Expand Down
1 change: 1 addition & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1378,6 +1378,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piQueueCreate, piQueueCreate)
_PI_CL(piQueueGetInfo, clGetCommandQueueInfo)
_PI_CL(piQueueFinish, clFinish)
_PI_CL(piQueueFlush, clFlush)
_PI_CL(piQueueRetain, clRetainCommandQueue)
_PI_CL(piQueueRelease, clReleaseCommandQueue)
_PI_CL(piextQueueGetNativeHandle, piextQueueGetNativeHandle)
Expand Down
33 changes: 30 additions & 3 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,12 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) {
MState = HES_NotComplete;
}

event_impl::event_impl() : MState(HES_Complete) {}
event_impl::event_impl() : MIsFlushed(true), MState(HES_Complete) {}

event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
: MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)),
MOpenCLInterop(true), MHostEvent(false), MState(HES_Complete) {
MOpenCLInterop(true), MHostEvent(false), MIsFlushed(true),
MState(HES_Complete) {

if (MContext->is_host()) {
throw cl::sycl::invalid_parameter_error(
Expand All @@ -120,7 +121,7 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
getPlugin().call<PiApiKind::piEventRetain>(MEvent);
}

event_impl::event_impl(QueueImplPtr Queue) {
event_impl::event_impl(const QueueImplPtr &Queue) : MQueue{Queue} {
if (Queue->is_host()) {
MState.store(HES_NotComplete);

Expand Down Expand Up @@ -344,6 +345,32 @@ std::vector<EventImplPtr> event_impl::getWaitList() {
return Result;
}

void event_impl::flushIfNeeded(const QueueImplPtr &UserQueue) {
assert(MEvent != nullptr);
if (MIsFlushed)
return;

QueueImplPtr Queue = MQueue.lock();
// If the queue has been released, all of the commands have already been
// implicitly flushed by piQueueRelease.
if (!Queue) {
MIsFlushed = true;
return;
}
if (Queue == UserQueue)
return;

// Check if the task for this event has already been submitted.
pi_event_status Status = PI_EVENT_QUEUED;
getPlugin().call<PiApiKind::piEventGetInfo>(
MEvent, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS, sizeof(pi_int32), &Status,
nullptr);
if (Status == PI_EVENT_QUEUED) {
getPlugin().call<PiApiKind::piQueueFlush>(Queue->getHandleRef());
}
MIsFlushed = true;
}

void event_impl::cleanupDependencyEvents() {
std::lock_guard<std::mutex> Lock(MMutex);
MPreparedDepsEvents.clear();
Expand Down
12 changes: 11 additions & 1 deletion sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class event_impl {
/// \param Event is a valid instance of plug-in event.
/// \param SyclContext is an instance of SYCL context.
event_impl(RT::PiEvent Event, const context &SyclContext);
event_impl(QueueImplPtr Queue);
event_impl(const QueueImplPtr &Queue);

/// Checks if this event is a SYCL host event.
///
Expand Down Expand Up @@ -182,6 +182,11 @@ class event_impl {
/// @return a vector of "immediate" dependencies for this event_impl.
std::vector<EventImplPtr> getWaitList();

/// Performs a flush on the queue associated with this event if the user queue
/// is different and the task associated with this event hasn't been submitted
/// to the device yet.
void flushIfNeeded(const QueueImplPtr &UserQueue);

/// Cleans dependencies of this event_impl
void cleanupDependencyEvents();

Expand All @@ -200,11 +205,16 @@ class event_impl {
bool MHostEvent = true;
std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
void *MCommand = nullptr;
std::weak_ptr<queue_impl> MQueue;

/// Dependency events prepared for waiting by backend.
std::vector<EventImplPtr> MPreparedDepsEvents;
std::vector<EventImplPtr> MPreparedHostDepsEvents;

/// Indicates that the task associated with this event has been submitted by
/// the queue to the device.
std::atomic<bool> MIsFlushed = false;

enum HostEventState : int { HES_NotComplete = 0, HES_Complete };

// State of host event. Employed only for host events and event with no
Expand Down
13 changes: 13 additions & 0 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,13 @@ getPiEvents(const std::vector<EventImplPtr> &EventImpls) {
return RetPiEvents;
}

static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
const QueueImplPtr &Queue) {
for (auto &EventImpl : EventImpls) {
EventImpl->flushIfNeeded(Queue);
}
}

class DispatchHostTask {
ExecCGCommand *MThisCmd;
std::vector<interop_handle::ReqToMem> MReqToMem;
Expand Down Expand Up @@ -325,6 +332,7 @@ void Command::waitForEvents(QueueImplPtr Queue,
#endif

std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
flushCrossQueueDeps(EventImpls, getWorkerQueue());
const detail::plugin &Plugin = Queue->getPlugin();
Plugin.call<PiApiKind::piEnqueueEventsWait>(
Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event);
Expand Down Expand Up @@ -1073,6 +1081,7 @@ cl_int MapMemObject::enqueueImp() {
waitForPreparedHostEvents();
std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
flushCrossQueueDeps(EventImpls, getWorkerQueue());

RT::PiEvent &Event = MEvent->getHandleRef();
*MDstPtr = MemoryManager::map(
Expand Down Expand Up @@ -1150,6 +1159,7 @@ cl_int UnMapMemObject::enqueueImp() {
waitForPreparedHostEvents();
std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
flushCrossQueueDeps(EventImpls, getWorkerQueue());

RT::PiEvent &Event = MEvent->getHandleRef();
MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
Expand Down Expand Up @@ -1250,6 +1260,7 @@ cl_int MemCpyCommand::enqueueImp() {
RT::PiEvent &Event = MEvent->getHandleRef();

auto RawEvents = getPiEvents(EventImpls);
flushCrossQueueDeps(EventImpls, getWorkerQueue());

MemoryManager::copy(
MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
Expand Down Expand Up @@ -1400,6 +1411,7 @@ cl_int MemCpyCommandHost::enqueueImp() {
return CL_SUCCESS;
}

flushCrossQueueDeps(EventImpls, getWorkerQueue());
MemoryManager::copy(
MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(),
MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
Expand Down Expand Up @@ -1989,6 +2001,7 @@ cl_int ExecCGCommand::enqueueImp() {
waitForPreparedHostEvents();
std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
auto RawEvents = getPiEvents(EventImpls);
flushCrossQueueDeps(EventImpls, getWorkerQueue());

RT::PiEvent &Event = MEvent->getHandleRef();

Expand Down
1 change: 1 addition & 0 deletions sycl/test/abi/pi_level_zero_symbol_check.dump
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ piProgramRelease
piProgramRetain
piQueueCreate
piQueueFinish
piQueueFlush
piQueueGetInfo
piQueueRelease
piQueueRetain
Expand Down
13 changes: 13 additions & 0 deletions sycl/unittests/helpers/CommonRedefinitions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,18 @@ inline pi_result redefinedEventsWaitCommon(pi_uint32 num_events,
return PI_SUCCESS;
}

inline pi_result redefinedEventGetInfoCommon(pi_event event,
pi_event_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
if (param_name == PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) {
auto *status = reinterpret_cast<pi_event_status *>(param_value);
*status = PI_EVENT_SUBMITTED;
}
return PI_SUCCESS;
}

inline pi_result redefinedEventReleaseCommon(pi_event event) {
if (event != nullptr)
delete reinterpret_cast<int *>(event);
Expand Down Expand Up @@ -166,6 +178,7 @@ inline void setupDefaultMockAPIs(sycl::unittest::PiMock &Mock) {
Mock.redefine<PiApiKind::piKernelSetExecInfo>(
redefinedKernelSetExecInfoCommon);
Mock.redefine<PiApiKind::piEventsWait>(redefinedEventsWaitCommon);
Mock.redefine<PiApiKind::piEventGetInfo>(redefinedEventGetInfoCommon);
Mock.redefine<PiApiKind::piEventRelease>(redefinedEventReleaseCommon);
Mock.redefine<PiApiKind::piEnqueueKernelLaunch>(
redefinedEnqueueKernelLaunchCommon);
Expand Down
1 change: 1 addition & 0 deletions sycl/unittests/scheduler/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@ add_sycl_unittest(SchedulerTests OBJECT
InOrderQueueHostTaskDeps.cpp
AllocaLinking.cpp
RequiredWGSize.cpp
QueueFlushing.cpp
utils.cpp
)
Loading