Skip to content

[SYCL] Add nested calls detection to shortcut functions #13659

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions sycl/include/sycl/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ using IsReduOptForFastAtomicFetch =
std::bool_constant<false>;
#else
std::bool_constant<((is_sgenfloat_v<T> && sizeof(T) == 4) ||
is_sgeninteger_v<T>)&&IsValidAtomicType<T>::value &&
is_sgeninteger_v<T>) &&
IsValidAtomicType<T>::value &&
(IsPlus<T, BinaryOperation>::value ||
IsMinimum<T, BinaryOperation>::value ||
IsMaximum<T, BinaryOperation>::value ||
Expand Down Expand Up @@ -138,11 +139,12 @@ using IsReduOptForFastReduce =
#ifdef SYCL_REDUCTION_DETERMINISTIC
std::bool_constant<false>;
#else
std::bool_constant<(
(is_sgeninteger_v<T> && (sizeof(T) == 4 || sizeof(T) == 8)) ||
is_sgenfloat_v<T>)&&(IsPlus<T, BinaryOperation>::value ||
IsMinimum<T, BinaryOperation>::value ||
IsMaximum<T, BinaryOperation>::value)>;
std::bool_constant<((is_sgeninteger_v<T> &&
(sizeof(T) == 4 || sizeof(T) == 8)) ||
is_sgenfloat_v<T>) &&
(IsPlus<T, BinaryOperation>::value ||
IsMinimum<T, BinaryOperation>::value ||
IsMaximum<T, BinaryOperation>::value)>;
#endif

// std::tuple seems to be a) too heavy and b) not copyable to device now
Expand Down Expand Up @@ -835,6 +837,10 @@ using __sycl_init_mem_for =
std::conditional_t<std::is_same_v<KernelName, auto_name>, auto_name,
reduction::InitMemKrn<KernelName>>;

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
std::shared_ptr<int> &Counter);

template <typename T, class BinaryOperation, int Dims, size_t Extent,
bool ExplicitIdentity, typename RedOutVar>
class reduction_impl_algo {
Expand Down Expand Up @@ -1075,8 +1081,7 @@ class reduction_impl_algo {
std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
CGH.addReduction(Counter);

auto Event = q.memset(Counter.get(), 0, sizeof(int));
CGH.depends_on(Event);
addCounterInit(CGH, CGH.MQueue, Counter);

return Counter.get();
}
Expand Down
62 changes: 60 additions & 2 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ namespace sycl {
inline namespace _V1 {
namespace detail {
std::atomic<unsigned long long> queue_impl::MNextAvailableQueueID = 0;
thread_local bool NestedCallsDetector = false;
class NestedCallsTracker {
public:
NestedCallsTracker() {
if (NestedCallsDetector)
throw sycl::exception(
make_error_code(errc::invalid),
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead.");
NestedCallsDetector = true;
}

~NestedCallsTracker() { NestedCallsDetector = false; }
};

static std::vector<sycl::detail::pi::PiEvent>
getPIEvents(const std::vector<sycl::event> &DepEvents) {
Expand Down Expand Up @@ -330,6 +344,46 @@ void queue_impl::addSharedEvent(const event &Event) {
MEventsShared.push_back(Event);
}

event queue_impl::submit_impl(const std::function<void(handler &)> &CGF,
const std::shared_ptr<queue_impl> &Self,
const std::shared_ptr<queue_impl> &PrimaryQueue,
const std::shared_ptr<queue_impl> &SecondaryQueue,
const detail::code_location &Loc,
const SubmitPostProcessF *PostProcess) {
handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
Handler.saveCodeLoc(Loc);

{
NestedCallsTracker tracker;
CGF(Handler);
}

// Scheduler will later omit events, that are not required to execute tasks.
// Host and interop tasks, however, are not submitted to low-level runtimes
// and require separate dependency management.
const CG::CGTYPE Type = Handler.getType();
event Event = detail::createSyclObjFromImpl<event>(
std::make_shared<detail::event_impl>());

if (PostProcess) {
bool IsKernel = Type == CG::Kernel;
bool KernelUsesAssert = false;

if (IsKernel)
// Kernel only uses assert if it's non interop one
KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
ProgramManager::getInstance().kernelUsesAssert(
Handler.MKernelName.c_str());
finalizeHandler(Handler, Event);

(*PostProcess)(IsKernel, KernelUsesAssert, Event);
} else
finalizeHandler(Handler, Event);

addEvent(Event);
return Event;
}

template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
const std::vector<event> &DepEvents,
Expand Down Expand Up @@ -362,15 +416,19 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
ExpandedDepEvents, MContext)) {
if (MSupportsDiscardingPiEvents) {
NestedCallsTracker tracker;
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
/*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
return createDiscardedEvent();
}

event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
auto EventImpl = detail::getSyclObjImpl(ResEvent);
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
&EventImpl->getHandleRef(), EventImpl);
{
NestedCallsTracker tracker;
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
&EventImpl->getHandleRef(), EventImpl);
}

if (MContext->is_host())
return MDiscardEvents ? createDiscardedEvent() : event();
Expand Down
48 changes: 1 addition & 47 deletions sycl/source/detail/queue_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -849,53 +849,7 @@ class queue_impl {
const std::shared_ptr<queue_impl> &PrimaryQueue,
const std::shared_ptr<queue_impl> &SecondaryQueue,
const detail::code_location &Loc,
const SubmitPostProcessF *PostProcess) {
// Flag used to detect nested calls to submit and report an error.
thread_local static bool PreventSubmit = false;

if (PreventSubmit) {
throw sycl::exception(
make_error_code(errc::invalid),
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead.");
}

handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue);
Handler.saveCodeLoc(Loc);
PreventSubmit = true;
try {
CGF(Handler);
} catch (...) {
PreventSubmit = false;
throw;
}
PreventSubmit = false;

// Scheduler will later omit events, that are not required to execute tasks.
// Host and interop tasks, however, are not submitted to low-level runtimes
// and require separate dependency management.
const CG::CGTYPE Type = Handler.getType();
event Event = detail::createSyclObjFromImpl<event>(
std::make_shared<detail::event_impl>());

if (PostProcess) {
bool IsKernel = Type == CG::Kernel;
bool KernelUsesAssert = false;

if (IsKernel)
// Kernel only uses assert if it's non interop one
KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) &&
ProgramManager::getInstance().kernelUsesAssert(
Handler.MKernelName.c_str());
finalizeHandler(Handler, Event);

(*PostProcess)(IsKernel, KernelUsesAssert, Event);
} else
finalizeHandler(Handler, Event);

addEvent(Event);
return Event;
}
const SubmitPostProcessF *PostProcess);

/// Helper function for submitting a memory operation with a handler.
/// \param Self is a shared_ptr to this queue.
Expand Down
12 changes: 12 additions & 0 deletions sycl/source/detail/reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//

#include <detail/config.hpp>
#include <detail/memory_manager.hpp>
#include <detail/queue_impl.hpp>
#include <sycl/reduction.hpp>

Expand Down Expand Up @@ -165,6 +166,17 @@ __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr<queue_impl> &Queue,
return reduGetMaxWGSize(Queue, LocalMemBytesPerWorkItem);
}

__SYCL_EXPORT void
addCounterInit(handler &CGH, std::shared_ptr<sycl::detail::queue_impl> &Queue,
std::shared_ptr<int> &Counter) {
auto EventImpl = std::make_shared<detail::event_impl>(Queue);
EventImpl->setContextImpl(detail::getSyclObjImpl(Queue->get_context()));
EventImpl->setStateIncomplete();
MemoryManager::fill_usm(Counter.get(), Queue, sizeof(int), 0, {},
&EventImpl->getHandleRef(), EventImpl);
CGH.depends_on(createSyclObjFromImpl<event>(EventImpl));
}

} // namespace detail
} // namespace _V1
} // namespace sycl
55 changes: 44 additions & 11 deletions sycl/test-e2e/Basic/nested_queue_submit.cpp
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

#include <cstdlib>
#include <string>
#include <sycl/detail/core.hpp>
#include <sycl/usm.hpp>
#include <vector>

void nestedSubmit() {
void checkExceptionFields(const sycl::exception &e) {
assert(e.code() == sycl::errc::invalid && "Invalid error code");
assert(std::string(e.what()) ==
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead." &&
"Invalid e.what() string");
}

void nestedSubmitParallelFor(sycl::queue &q) {
uint32_t n = 1024;
std::vector<float> array(n);
sycl::queue q{};
{
sycl::buffer<float> buf(array.data(), sycl::range<1>{n});
q.submit([&](sycl::handler &h) {
Expand All @@ -19,16 +27,41 @@ void nestedSubmit() {
}
}

int main() {
void nestedSubmitMemset(sycl::queue &q) {
uint32_t n = 1024;
int *data = sycl::malloc_device<int>(n, q);
try {
q.submit([&](sycl::handler &h) { q.memset(data, 0, n * sizeof(int)); });
} catch (...) {
sycl::free(data, q);
throw;
}
sycl::free(data, q);
}

template <typename CommandSubmitterT>
void test(sycl::queue &Queue, CommandSubmitterT QueueSubmit) {
bool ExceptionHappened = false;
try {
nestedSubmit();
QueueSubmit(Queue);
} catch (const sycl::exception &e) {
assert(e.code() == sycl::errc::invalid && "Invalid error code");
assert(std::string(e.what()) ==
"Calls to sycl::queue::submit cannot be nested. Command group "
"function objects should use the sycl::handler API instead." &&
"Invalid e.what() string");
checkExceptionFields(e);
ExceptionHappened = true;
}
std::cout << "test passed" << std::endl;
assert(ExceptionHappened);
// Checks that queue is in a valid state: nested call tracker was cleaned up
// after exception and does not prevent from submission of new commands.
Queue.submit([&](sycl::handler &h) {});
Queue.wait();
}

int main() {
sycl::queue q{};
test(q, nestedSubmitParallelFor);
// All shortcut functions has a common part where nested call detection
// happens. Testing only one of them is enough.
if (q.get_device().get_info<sycl::info::device::usm_device_allocations>())
test(q, nestedSubmitMemset);

return EXIT_SUCCESS;
}
1 change: 1 addition & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3314,6 +3314,7 @@ _ZN4sycl3_V16detail13lgamma_r_implEfPi
_ZN4sycl3_V16detail13make_platformEmNS0_7backendE
_ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEE
_ZN4sycl3_V16detail13select_deviceERKSt8functionIFiRKNS0_6deviceEEERKNS0_7contextE
_ZN4sycl3_V16detail14addCounterInitERNS0_7handlerERSt10shared_ptrINS1_10queue_implEERS4_IiE
_ZN4sycl3_V16detail14getBorderColorENS0_19image_channel_orderE
_ZN4sycl3_V16detail14tls_code_loc_t5queryEv
_ZN4sycl3_V16detail14tls_code_loc_tC1ERKNS1_13code_locationE
Expand Down
13 changes: 7 additions & 6 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,12 @@
??$import_external_semaphore@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z
??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z
??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z
??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z
??$update_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$00@45@@Z
??$update_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$01@45@@Z
??$update_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$02@45@@Z
??0AccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VAccessorImplHost@detail@_V1@sycl@@@std@@@Z
??0AccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z
??0AccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z
Expand Down Expand Up @@ -3932,6 +3938,7 @@
?add@device_global_map@detail@_V1@sycl@@YAXPEBXPEBD@Z
?add@host_pipe_map@detail@_V1@sycl@@YAXPEBXPEBD@Z
?add@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA?AVnode@34567@AEBVproperty_list@67@@Z
?addCounterInit@detail@_V1@sycl@@YAXAEAVhandler@23@AEAV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@AEAV?$shared_ptr@H@6@@Z
?addGraphLeafDependencies@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAAXVnode@34567@@Z
?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z
?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z
Expand Down Expand Up @@ -4355,12 +4362,6 @@
?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z
?get_name@kernel_id@_V1@sycl@@QEBAPEBDXZ
?get_node_from_event@node@experimental@oneapi@ext@_V1@sycl@@SA?AV123456@Vevent@56@@Z
??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z
??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z
??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z
??$update_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$00@45@@Z
??$update_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$01@45@@Z
??$update_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$range@$02@45@@Z
?get_nodes@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ
?get_num_channels@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAIXZ
?get_pipe_name@pipe_base@experimental@intel@ext@_V1@sycl@@KA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEBX@Z
Expand Down
Loading