Skip to content

Commit 614c6ee

Browse files
authored
Refactor queue submission to resolve a hang (#377)
when fast kernel relaunching many dependent events in a row. This removes a circular dependence and recursion between acl_command_queue and acl_device_op and thus makes the code much simpler to reason about. All commands are now submitted by the acl_update_*_queue functions.
1 parent 9a3204e commit 614c6ee

File tree

3 files changed

+72
-100
lines changed

3 files changed

+72
-100
lines changed

src/acl_command_queue.cpp

Lines changed: 72 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -630,93 +630,60 @@ int acl_update_queue(cl_command_queue command_queue) {
630630
}
631631
}
632632

633-
void acl_try_FastKernelRelaunch_ooo_queue_event_dependents(cl_event parent) {
633+
// Try to submit a kernel even if it has unfinished dependences using fast
634+
// kernel relaunch
635+
// Returns true on success, false on failure
636+
bool acl_fast_relaunch_kernel(cl_event event) {
637+
if (!(event->command_queue->properties &
638+
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE))
639+
return false;
640+
641+
if (event->depend_on.size() != 1)
642+
return false;
643+
644+
cl_event parent = *(event->depend_on.begin());
645+
634646
if (!(parent->command_queue->properties &
635647
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE))
636-
return;
637-
if (parent->depend_on_me.empty())
638-
return;
648+
return false;
649+
639650
if (parent->cmd.type != CL_COMMAND_TASK &&
640651
parent->cmd.type != CL_COMMAND_NDRANGE_KERNEL)
641-
return;
652+
return false;
653+
642654
if (parent->execution_status > CL_SUBMITTED ||
643655
parent->last_device_op->status > CL_SUBMITTED)
644-
return;
645-
646-
// Check if fast kernel relaunch is safe to use, and we can ignore
647-
// the explicit dependency
648-
for (auto dependent_it = parent->depend_on_me.begin();
649-
dependent_it != parent->depend_on_me.end(); dependent_it++) {
650-
cl_event dependent = *dependent_it;
651-
// Currently we do not handle the case of FKR for mixed queue types
652-
if (!(dependent->command_queue->properties &
653-
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE))
654-
continue;
655-
// can only FKR if one unresolved dependency
656-
if (dependent->depend_on.size() > 1)
657-
continue;
658-
// can happen if this function gets called twice on same parent
659-
// once during submission and once during completion
660-
if (dependent->is_on_device_op_queue)
661-
continue;
662-
663-
if (!l_is_same_kernel_event(parent, dependent)) {
664-
// dependent on a different kernel than parent,
665-
// must wait for dependency to be resolved
666-
// OR the dependent is not on the same device,
667-
// not safe to preemptively push dependent to device_op_queue
668-
continue;
669-
}
670-
671-
// Special case: if subbuffers are present they may(!) cause a
672-
// migration while another kernel is using that data.
673-
if (acl_kernel_has_unmapped_subbuffers(
674-
&(dependent->cmd.info.ndrange_kernel.memory_migration))) {
675-
continue;
676-
}
677-
678-
// Fast Kernel Relaunch: submitting is safe even though has dependency
679-
// Prior to submitting remove dependency
680-
int local_updates = acl_submit_command(dependent);
681-
if (local_updates) {
682-
dependent->depend_on.erase(parent);
683-
dependent_it = parent->depend_on_me.erase(dependent_it);
684-
dependent_it--; // decrement it otherwise we will skip an element
685-
dependent->command_queue->num_commands_submitted++;
686-
}
687-
}
656+
return false;
657+
658+
if (!l_is_same_kernel_event(parent, event)) {
659+
// dependent on a different kernel than parent,
660+
// must wait for dependency to be resolved
661+
// OR the dependent is not on the same device,
662+
// not safe to preemptively push dependent to device_op_queue
663+
return false;
664+
}
665+
666+
// Special case: if subbuffers are present they may(!) cause a
667+
// migration while another kernel is using that data.
668+
if (acl_kernel_has_unmapped_subbuffers(
669+
&(event->cmd.info.ndrange_kernel.memory_migration)))
670+
return false;
671+
672+
// Fast Kernel Relaunch: submitting is safe even though has dependency
673+
// If submission succeeds, remove dependency
674+
bool success = acl_submit_command(event);
675+
if (!success)
676+
return false;
677+
event->depend_on.erase(parent);
678+
parent->depend_on_me.remove(event);
679+
return true;
688680
}
689681

690682
int acl_update_ooo_queue(cl_command_queue command_queue) {
691683
int num_updates = 0;
692684

693-
// Directly submit the event if it has no dependencies
694-
// unless it is a user_event queue which never submits events
695-
while (!command_queue->new_commands.empty()) {
696-
int success = 1;
697-
cl_event event = command_queue->new_commands.front();
698-
if (command_queue->submits_commands &&
699-
event->execution_status == CL_QUEUED) {
700-
if (event->depend_on.empty()) {
701-
command_queue->num_commands_submitted++;
702-
success = acl_submit_command(event);
703-
} else {
704-
// This is allowed to fail, so no need to mark success as false
705-
// dependent events that fail to be FKRd will still be picked up when
706-
// their parent event finishes
707-
acl_try_FastKernelRelaunch_ooo_queue_event_dependents(
708-
*(event->depend_on.begin()));
709-
}
710-
}
711-
712-
if (success) {
713-
// safe to pop as there is a master copy in command_queue->commands
714-
command_queue->new_commands.pop_front();
715-
}
716-
}
717-
718-
// Remove dependencies on completed events, and launch any events
719-
// that no longer have dependencies.
685+
// First, remove dependencies on completed events,
686+
// as this may unblock other evevnts
720687
// Completed events should be returned to the free pool
721688
while (!command_queue->completed_commands.empty()) {
722689
cl_event event = command_queue->completed_commands.front();
@@ -735,16 +702,6 @@ int acl_update_ooo_queue(cl_command_queue command_queue) {
735702
dependent->command_queue->completed_commands.push_back(
736703
dependent); // dependent might be on another queue
737704
}
738-
} else if (dependent->depend_on.empty()) {
739-
// dependent has no dependencies safe to submit if in OOO queue
740-
if ((dependent->command_queue->properties &
741-
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) &&
742-
dependent->cmd.type != CL_COMMAND_USER) {
743-
int local_updates = acl_submit_command(dependent);
744-
dependent->command_queue->num_commands_submitted +=
745-
local_updates; // dependent might be on another queue
746-
num_updates += local_updates;
747-
}
748705
}
749706
}
750707

@@ -772,10 +729,39 @@ int acl_update_ooo_queue(cl_command_queue command_queue) {
772729
command_queue->commands.erase(event);
773730
}
774731
event->not_popped = false;
732+
num_updates++;
775733
command_queue->num_commands--;
776734
acl_release(command_queue);
777735
}
778736

737+
// Next try to submit any events with no dependencies
738+
// or whose only dependences can be handled by fast kernel relaunch
739+
// unless they are on a user_event queue which never submits events
740+
for (auto event_iter = command_queue->new_commands.begin();
741+
event_iter != command_queue->new_commands.end();) {
742+
cl_event event = *event_iter;
743+
int success = 0;
744+
if (!command_queue->submits_commands)
745+
success = 1;
746+
else {
747+
if (event->depend_on.empty()) {
748+
success = acl_submit_command(event);
749+
} else {
750+
success = acl_fast_relaunch_kernel(event);
751+
}
752+
}
753+
754+
// Increment before removal so we don't invalidate the iterator
755+
event_iter++;
756+
if (success) {
757+
// num_commands_submitted isn't used for ooo queues today
758+
// but keep it up-to-date in case someone wants to use it in the future
759+
command_queue->num_commands_submitted++;
760+
command_queue->new_commands.remove(event);
761+
num_updates++;
762+
}
763+
}
764+
779765
return num_updates;
780766
}
781767

src/acl_device_op.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1474,10 +1474,6 @@ void acl_post_status_to_owning_event(acl_device_op_t *op, int new_status) {
14741474
command_queue->num_commands_submitted--;
14751475
} else {
14761476
event->timestamp[new_status] = op->timestamp[new_status];
1477-
1478-
if (command_queue->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
1479-
acl_try_FastKernelRelaunch_ooo_queue_event_dependents(event);
1480-
}
14811477
}
14821478
}
14831479

src/acl_event.cpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1078,16 +1078,6 @@ int acl_notify_dependent_events(cl_event event) {
10781078
if (event->cmd.type == CL_COMMAND_USER && event->execution_status < 0) {
10791079
acl_set_execution_status(dependent, event->execution_status);
10801080
}
1081-
1082-
// Submit the event if it has no dependencies and is partt of an
1083-
// Out-of-order queue
1084-
if (dependent->command_queue->properties &
1085-
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE &&
1086-
dependent->depend_on.empty() &&
1087-
dependent->cmd.type != CL_COMMAND_USER) {
1088-
dependent->command_queue->num_commands_submitted++;
1089-
acl_submit_command(dependent);
1090-
}
10911081
}
10921082

10931083
int num_updates = static_cast<int>(event->depend_on_me.size());

0 commit comments

Comments
 (0)