Skip to content

[SYCL][CUDA] Fixes for multiple backends in the same program #1252

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 49 additions & 43 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ pi_result _pi_event::start() {
}

isStarted_ = true;
// let observers know that the event is "submitted"
trigger_callback(get_execution_status());
return result;
}

Expand Down Expand Up @@ -195,6 +197,22 @@ pi_result _pi_event::record() {

try {
result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream));

result = cuda_piEventRetain(this);
try {
result = PI_CHECK_ERROR(cuLaunchHostFunc(
cuStream,
[](void *userData) {
pi_event event = reinterpret_cast<pi_event>(userData);
event->set_event_complete();
cuda_piEventRelease(event);
},
this));
} catch (...) {
// If host function fails to enqueue we must release the event here
result = cuda_piEventRelease(this);
throw;
}
} catch (pi_result error) {
result = error;
}
Expand All @@ -215,6 +233,7 @@ pi_result _pi_event::wait() {
if (is_native_event()) {
try {
retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_));
isCompleted_ = true;
} catch (pi_result error) {
retErr = error;
}
Expand All @@ -226,30 +245,12 @@ pi_result _pi_event::wait() {
retErr = PI_SUCCESS;
}

return retErr;
}

pi_event_status _pi_event::get_execution_status() const noexcept {
auto is_success = retErr == PI_SUCCESS;
auto status = is_success ? get_execution_status() : pi_int32(retErr);

if (!is_recorded()) {
return PI_EVENT_SUBMITTED;
}

if (is_native_event()) {
// native event status

auto status = cuEventQuery(get());
if (status == CUDA_ERROR_NOT_READY) {
return PI_EVENT_RUNNING;
} else if (status != CUDA_SUCCESS) {
cl::sycl::detail::pi::die("Invalid CUDA event status");
}
return PI_EVENT_COMPLETE;
} else {
// user event status
trigger_callback(status);

return is_completed() ? PI_EVENT_COMPLETE : PI_EVENT_RUNNING;
}
return retErr;
}

// iterates over the event wait list, returns correct pi_result error codes.
Expand Down Expand Up @@ -2516,24 +2517,21 @@ pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name,

switch (param_name) {
case PI_EVENT_INFO_COMMAND_QUEUE:
return getInfo<pi_queue>(param_value_size, param_value,
param_value_size_ret, event->get_queue());
return getInfo(param_value_size, param_value, param_value_size_ret,
event->get_queue());
case PI_EVENT_INFO_COMMAND_TYPE:
return getInfo<pi_command_type>(param_value_size, param_value,
param_value_size_ret,
event->get_command_type());
return getInfo(param_value_size, param_value, param_value_size_ret,
event->get_command_type());
case PI_EVENT_INFO_REFERENCE_COUNT:
return getInfo<pi_uint32>(param_value_size, param_value,
param_value_size_ret,
event->get_reference_count());
return getInfo(param_value_size, param_value, param_value_size_ret,
event->get_reference_count());
case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: {
return getInfo<pi_event_status>(param_value_size, param_value,
param_value_size_ret,
event->get_execution_status());
return getInfo(param_value_size, param_value, param_value_size_ret,
static_cast<pi_event_status>(event->get_execution_status()));
}
case PI_EVENT_INFO_CONTEXT:
return getInfo<pi_context>(param_value_size, param_value,
param_value_size_ret, event->get_context());
return getInfo(param_value_size, param_value, param_value_size_ret,
event->get_context());
default:
PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
}
Expand Down Expand Up @@ -2568,13 +2566,21 @@ pi_result cuda_piEventGetProfilingInfo(
return {};
}

pi_result cuda_piEventSetCallback(
pi_event event, pi_int32 command_exec_callback_type,
void (*pfn_notify)(pi_event event, pi_int32 event_command_status,
void *user_data),
void *user_data) {
cl::sycl::detail::pi::die("cuda_piEventSetCallback not implemented");
return {};
pi_result cuda_piEventSetCallback(pi_event event,
pi_int32 command_exec_callback_type,
pfn_notify notify, void *user_data) {

assert(event);
assert(notify);
assert(command_exec_callback_type == PI_EVENT_SUBMITTED ||
command_exec_callback_type == PI_EVENT_RUNNING ||
command_exec_callback_type == PI_EVENT_COMPLETE);
event_callback callback(pi_event_status(command_exec_callback_type), notify,
user_data);

event->set_event_callback(callback);

return PI_SUCCESS;
}

pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
Expand All @@ -2587,7 +2593,7 @@ pi_result cuda_piEventSetStatus(pi_event event, pi_int32 execution_status) {
}

if (execution_status == PI_EVENT_COMPLETE) {
return event->set_user_event_complete();
return event->set_event_complete();
} else if (execution_status < 0) {
// TODO: A negative integer value causes all enqueued commands that wait
// on this user event to be terminated.
Expand Down
94 changes: 87 additions & 7 deletions sycl/plugins/cuda/pi_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,39 @@ struct _pi_queue {
pi_uint32 get_reference_count() const noexcept { return refCount_; }
};

typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus,
void *userData);

class event_callback {
public:
void trigger_callback(pi_event event, pi_int32 currentEventStatus) const {

auto validParameters = callback_ && event;

// As a pi_event_status value approaches 0, it gets closer to completion.
// If the calling pi_event's status is less than or equal to the event
// status the user is interested in, invoke the callback anyway. The event
// will have passed through that state anyway.
auto validStatus = currentEventStatus <= observedEventStatus_;

if (validParameters && validStatus) {

callback_(event, currentEventStatus, userData_);
}
}

event_callback(pi_event_status status, pfn_notify callback, void *userData)
: observedEventStatus_{status}, callback_{callback}, userData_{userData} {
}

pi_event_status get_status() const noexcept { return observedEventStatus_; }

private:
pi_event_status observedEventStatus_;
pfn_notify callback_;
void *userData_;
};

class _pi_event {
public:
using native_type = CUevent;
Expand All @@ -247,18 +280,39 @@ class _pi_event {

native_type get() const noexcept { return evEnd_; };

pi_result set_user_event_complete() noexcept {
pi_result set_event_complete() noexcept {

if (isCompleted_) {
return PI_INVALID_OPERATION;
}

if (is_user_event()) {
isRecorded_ = true;
isCompleted_ = true;
return PI_SUCCESS;
isRecorded_ = true;
isCompleted_ = true;

trigger_callback(get_execution_status());

return PI_SUCCESS;
}

void trigger_callback(pi_int32 status) {

std::vector<event_callback> callbacks;

// Here we move all callbacks into local variable before we call them.
// This is a defensive maneuver; if any of the callbacks attempt to
// add additional callbacks, we will end up in a bad spot. Our mutex
// will be locked twice and the vector will be modified as it is being
// iterated over! By moving everything locally, we can call all of these
// callbacks and let them modify the original vector without much worry.

{
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.swap(callbacks);
}

for (auto &event_callback : callbacks) {
event_callback.trigger_callback(this, status);
}
return PI_INVALID_EVENT;
}

pi_queue get_queue() const noexcept { return queue_; }
Expand All @@ -273,7 +327,27 @@ class _pi_event {

bool is_started() const noexcept { return isStarted_; }

pi_event_status get_execution_status() const noexcept;
pi_int32 get_execution_status() const noexcept {

if (!is_recorded()) {
return PI_EVENT_SUBMITTED;
}

if (!is_completed()) {
return PI_EVENT_RUNNING;
}
return PI_EVENT_COMPLETE;
}

void set_event_callback(const event_callback &callback) {
auto current_status = get_execution_status();
if (current_status <= callback.get_status()) {
callback.trigger_callback(this, current_status);
} else {
std::lock_guard<std::mutex> lock(mutex_);
event_callbacks_.emplace_back(callback);
}
}

pi_context get_context() const noexcept { return context_; };

Expand Down Expand Up @@ -343,6 +417,12 @@ class _pi_event {
pi_context context_; // pi_context associated with the event. If this is a
// native event, this will be the same context associated
// with the queue_ member.

std::mutex mutex_; // Protect access to event_callbacks_. TODO: There might be
// a lock-free data structure we can use here.
std::vector<event_callback>
event_callbacks_; // Callbacks that can be triggered when an event's state
// changes.
};

struct _pi_program {
Expand Down
33 changes: 19 additions & 14 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,45 +161,50 @@ void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) {
EventImplPtr *Event = (reinterpret_cast<EventImplPtr *>(data));
RT::PiEvent &EventHandle = (*Event)->getHandleRef();
const detail::plugin &Plugin = (*Event)->getPlugin();
Plugin.call<PiApiKind::piEventSetStatus>(EventHandle, CL_COMPLETE);
Plugin.call<PiApiKind::piEventSetStatus>(EventHandle, PI_EVENT_COMPLETE);
delete (Event);
}

// Method prepares PI event's from list sycl::event's
std::vector<EventImplPtr> Command::prepareEvents(ContextImplPtr Context) {
std::vector<EventImplPtr> Result;
std::vector<EventImplPtr> GlueEvents;
for (EventImplPtr &Event : MDepsEvents) {
for (EventImplPtr &DepEvent : MDepsEvents) {
// Async work is not supported for host device.
if (Event->is_host()) {
Event->waitInternal();
if (DepEvent->is_host()) {
DepEvent->waitInternal();
continue;
}
// The event handle can be null in case of, for example, alloca command,
// which is currently synchrounious, so don't generate OpenCL event.
if (Event->getHandleRef() == nullptr) {
if (DepEvent->getHandleRef() == nullptr) {
continue;
}
ContextImplPtr EventContext = Event->getContextImpl();
const detail::plugin &Plugin = Event->getPlugin();
// If contexts don't match - connect them using user event
if (EventContext != Context && !Context->is_host()) {
ContextImplPtr DepEventContext = DepEvent->getContextImpl();

// If contexts don't match - connect them using user event
if (DepEventContext != Context && !Context->is_host()) {
EventImplPtr GlueEvent(new detail::event_impl());
GlueEvent->setContextImpl(Context);
EventImplPtr *GlueEventCopy =
new EventImplPtr(GlueEvent); // To increase the reference count by 1.

RT::PiEvent &GlueEventHandle = GlueEvent->getHandleRef();
auto Plugin = Context->getPlugin();
auto DepPlugin = DepEventContext->getPlugin();
// Add an event on the current context that
// is triggered when the DepEvent is complete
Plugin.call<PiApiKind::piEventCreate>(Context->getHandleRef(),
&GlueEventHandle);
EventImplPtr *GlueEventCopy =
new EventImplPtr(GlueEvent); // To increase the reference count by 1.
Plugin.call<PiApiKind::piEventSetCallback>(
Event->getHandleRef(), CL_COMPLETE, EventCompletionClbk,

DepPlugin.call<PiApiKind::piEventSetCallback>(
DepEvent->getHandleRef(), PI_EVENT_COMPLETE, EventCompletionClbk,
/*void *data=*/(GlueEventCopy));
GlueEvents.push_back(GlueEvent);
Result.push_back(std::move(GlueEvent));
continue;
}
Result.push_back(Event);
Result.push_back(DepEvent);
}
MDepsEvents.insert(MDepsEvents.end(), GlueEvents.begin(), GlueEvents.end());
return Result;
Expand Down
3 changes: 0 additions & 3 deletions sycl/test/basic_tests/buffer/buffer_dev_to_dev.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

// TODO: pi_die: cuda_piEventSetCallback not implemented
// XFAIL: cuda

//==---------- buffer_dev_to_dev.cpp - SYCL buffer basic test --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
Expand Down
1 change: 0 additions & 1 deletion sycl/test/scheduler/DataMovement.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out
// RUN: %t.out
//
// XFAIL: cuda
//==-------------------------- DataMovement.cpp ----------------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
Expand Down
3 changes: 0 additions & 3 deletions sycl/test/scheduler/MultipleDevices.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -I %sycl_source_dir %s -o %t.out
// RUN: %t.out

// TODO: pi_die: cuda_piEventSetCallback not implemented
// XFAIL: cuda

//===- MultipleDevices.cpp - Test checking multi-device execution --------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
Expand Down
Loading