Skip to content

[UR][L0] Avoid calls to destroy interop data structures given loader instability #17543

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Mar 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions unified-runtime/source/adapters/level_zero/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@
#include <unordered_map>
#include <vector>

#ifdef _WIN32
#include "windows.h"
#else
#include <dlfcn.h>
#include <unistd.h>
#endif

#include <ur/ur.hpp>
#include <ur_ddi.h>
#include <ze_api.h>
Expand All @@ -30,6 +37,68 @@

struct _ur_platform_handle_t;

[[maybe_unused]] static bool checkL0LoaderTeardown() {
bool loaderStable = true;
#ifdef _WIN32
uint32_t ZeDriverCount = 0;
HMODULE zeLoader = LoadLibrary("ze_loader.dll");
if (zeLoader) {
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
zeDriverGet_t zeDriverGetLoader =
(zeDriverGet_t)GetProcAddress(zeLoader, "zeDriverGet");
if (zeDriverGetLoader) {
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
logger::debug(
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
ZeDriverCount);
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
loaderStable = false;
}
} else {
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
"zeDriverGet");
loaderStable = false;
}
FreeLibrary(zeLoader);
} else {
logger::debug(
"ZE ---> checkL0LoaderTeardown: Failed to load ze_loader.dll");
loaderStable = false;
}
#else
uint32_t ZeDriverCount = 0;
void *zeLoader = dlopen("libze_loader.so.1", RTLD_LAZY);
if (zeLoader) {
typedef ze_result_t (*zeDriverGet_t)(uint32_t *, ze_driver_handle_t *);
zeDriverGet_t zeDriverGetLoader =
(zeDriverGet_t)dlsym(zeLoader, "zeDriverGet");
if (zeDriverGetLoader) {
ze_result_t result = zeDriverGetLoader(&ZeDriverCount, nullptr);
logger::debug(
"ZE ---> checkL0LoaderTeardown result = {} driver count = {}", result,
ZeDriverCount);
if (result != ZE_RESULT_SUCCESS || ZeDriverCount == 0) {
loaderStable = false;
}
} else {
logger::debug("ZE ---> checkL0LoaderTeardown: Failed to get address of "
"zeDriverGet");
loaderStable = false;
}
dlclose(zeLoader);
} else {
logger::debug(
"ZE ---> checkL0LoaderTeardown: Failed to load libze_loader.so.1");
loaderStable = false;
}
#endif
if (!loaderStable) {
logger::debug(
"ZE ---> checkL0LoaderTeardown: Loader is not stable, returning false");
}
return loaderStable;
}

static auto getUrResultString = [](ur_result_t Result) {
switch (Result) {
case UR_RESULT_SUCCESS:
Expand Down Expand Up @@ -435,6 +504,9 @@ struct _ur_object {
// Indicates if we own the native handle or it came from interop that
// asked to not transfer the ownership to SYCL RT.
bool OwnNativeHandle = false;

// Indicates if this object is an interop handle.
bool IsInteropNativeHandle = false;
};

// Record for a memory allocation. This structure is used to keep information
Expand Down
18 changes: 13 additions & 5 deletions unified-runtime/source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ ur_result_t urContextCreateWithNativeHandle(
ur_context_handle_t_ *UrContext = new ur_context_handle_t_(
ZeContext, NumDevices, Devices, OwnNativeHandle);
UrContext->initialize();
UrContext->IsInteropNativeHandle = true;
*Context = reinterpret_cast<ur_context_handle_t>(UrContext);
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
Expand Down Expand Up @@ -262,7 +263,11 @@ ur_result_t ContextReleaseHelper(ur_context_handle_t Context) {
Contexts.erase(It);
}
ze_context_handle_t DestroyZeContext =
Context->OwnNativeHandle ? Context->ZeContext : nullptr;
((Context->OwnNativeHandle && !Context->IsInteropNativeHandle) ||
(Context->OwnNativeHandle && Context->IsInteropNativeHandle &&
checkL0LoaderTeardown()))
? Context->ZeContext
: nullptr;

// Clean up any live memory associated with Context
ur_result_t Result = Context->finalize();
Expand Down Expand Up @@ -299,11 +304,14 @@ ur_result_t ur_context_handle_t_::finalize() {
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
for (auto &EventCache : EventCaches) {
for (auto &Event : EventCache) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
if (!Event->IsInteropNativeHandle ||
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
Event->ZeEvent = nullptr;
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
delete Event;
}
EventCache.clear();
Expand Down
1 change: 1 addition & 0 deletions unified-runtime/source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1416,6 +1416,7 @@ ur_result_t urDeviceCreateWithNativeHandle(
if (Dev == nullptr)
return UR_RESULT_ERROR_INVALID_VALUE;

Dev->IsInteropNativeHandle = true;
*Device = Dev;
return UR_RESULT_SUCCESS;
}
Expand Down
12 changes: 8 additions & 4 deletions unified-runtime/source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,7 @@ ur_result_t urEventCreateWithNativeHandle(
UREvent->CleanedUp = true;

*Event = reinterpret_cast<ur_event_handle_t>(UREvent);
UREvent->IsInteropNativeHandle = true;

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -1116,11 +1117,14 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
}
if (Event->OwnNativeHandle) {
if (DisableEventsCaching) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
if (!Event->IsInteropNativeHandle ||
(Event->IsInteropNativeHandle && checkL0LoaderTeardown())) {
auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
Event->ZeEvent = nullptr;
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
auto Context = Event->Context;
if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
return Res;
Expand Down
12 changes: 8 additions & 4 deletions unified-runtime/source/adapters/level_zero/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -940,10 +940,13 @@ ur_result_t urKernelRelease(
auto KernelProgram = Kernel->Program;
if (Kernel->OwnNativeHandle) {
for (auto &ZeKernel : Kernel->ZeKernels) {
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
if (!Kernel->IsInteropNativeHandle ||
(Kernel->IsInteropNativeHandle && checkL0LoaderTeardown())) {
auto ZeResult = ZE_CALL_NOCHECK(zeKernelDestroy, (ZeKernel));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
}
}
Kernel->ZeKernelMap.clear();
Expand Down Expand Up @@ -1154,6 +1157,7 @@ ur_result_t urKernelCreateWithNativeHandle(
}

Kernel->Program = Program;
Kernel->IsInteropNativeHandle = true;

UR_CALL(Kernel->initialize());

Expand Down
15 changes: 10 additions & 5 deletions unified-runtime/source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1563,6 +1563,7 @@ ur_result_t urMemImageCreateWithNativeHandle(
auto OwnNativeHandle = Properties ? Properties->isNativeHandleOwned : false;
UR_CALL(createUrMemFromZeImage(Context, ZeHImage, OwnNativeHandle,
ZeImageDesc, Mem));
(*Mem)->IsInteropNativeHandle = true;

return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -1662,11 +1663,14 @@ ur_result_t urMemRelease(
if (Image->OwnNativeHandle) {
UR_CALL(Mem->getZeHandle(ZeHandleImage, ur_mem_handle_t_::write_only,
nullptr, nullptr, 0u));
auto ZeResult = ZE_CALL_NOCHECK(
zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
if (!Image->IsInteropNativeHandle ||
(Image->IsInteropNativeHandle && checkL0LoaderTeardown())) {
auto ZeResult = ZE_CALL_NOCHECK(
zeImageDestroy, (ur_cast<ze_image_handle_t>(ZeHandleImage)));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
}
delete Image;
} else {
Expand Down Expand Up @@ -1772,6 +1776,7 @@ ur_result_t urMemBufferCreateWithNativeHandle(
Buffer = new _ur_buffer(Context, Size, Device, ur_cast<char *>(NativeMem),
OwnNativeHandle);
*Mem = reinterpret_cast<ur_mem_handle_t>(Buffer);
(*Mem)->IsInteropNativeHandle = true;
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
} catch (...) {
Expand Down
14 changes: 7 additions & 7 deletions unified-runtime/source/adapters/level_zero/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -956,7 +956,6 @@ ur_result_t urProgramCreateWithNativeHandle(
UR_ASSERT(Context && NativeProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
UR_ASSERT(Program, UR_RESULT_ERROR_INVALID_NULL_POINTER);
auto ZeModule = ur_cast<ze_module_handle_t>(NativeProgram);

// We assume here that programs created from a native handle always
// represent a fully linked executable (state Exe) and not an unlinked
// executable (state Object).
Expand All @@ -966,6 +965,7 @@ ur_result_t urProgramCreateWithNativeHandle(
ur_program_handle_t_::Exe, Context, ZeModule,
Properties ? Properties->isNativeHandleOwned : false);
*Program = reinterpret_cast<ur_program_handle_t>(UrProgram);
(*Program)->IsInteropNativeHandle = true;
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
} catch (...) {
Expand Down Expand Up @@ -1036,15 +1036,15 @@ ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context)
ur_program_handle_t_::ur_program_handle_t_(state, ur_context_handle_t Context,
ze_module_handle_t InteropZeModule)
: Context{Context}, NativeProperties{nullptr}, OwnZeModule{true},
AssociatedDevices({Context->getDevices()[0]}),
InteropZeModule{InteropZeModule} {}
AssociatedDevices({Context->getDevices()[0]}), InteropZeModule{
InteropZeModule} {}

ur_program_handle_t_::ur_program_handle_t_(state, ur_context_handle_t Context,
ze_module_handle_t InteropZeModule,
bool OwnZeModule)
: Context{Context}, NativeProperties{nullptr}, OwnZeModule{OwnZeModule},
AssociatedDevices({Context->getDevices()[0]}),
InteropZeModule{InteropZeModule} {
AssociatedDevices({Context->getDevices()[0]}), InteropZeModule{
InteropZeModule} {
// TODO: Currently it is not possible to understand the device associated
// with provided ZeModule. So we can't set the state on that device to Exe.
}
Expand Down Expand Up @@ -1080,10 +1080,10 @@ void ur_program_handle_t_::ur_release_program_resources(bool deletion) {
if (DeviceData.ZeBuildLog)
ZE_CALL_NOCHECK(zeModuleBuildLogDestroy, (DeviceData.ZeBuildLog));
}

// interop api
if (InteropZeModule && OwnZeModule)
if (InteropZeModule && OwnZeModule && checkL0LoaderTeardown()) {
ZE_CALL_NOCHECK(zeModuleDestroy, (InteropZeModule));
}

for (auto &[ZeDevice, DeviceData] : this->DeviceDataMap)
if (DeviceData.ZeModule)
Expand Down
12 changes: 8 additions & 4 deletions unified-runtime/source/adapters/level_zero/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,7 @@ ur_result_t urQueueCreateWithNativeHandle(
ur_queue_handle_t_ *Queue = new ur_queue_handle_t_(
ComputeQueues, CopyQueues, Context, UrDevice, OwnNativeHandle, Flags);
*RetQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
(*RetQueue)->IsInteropNativeHandle = true;
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
} catch (...) {
Expand Down Expand Up @@ -1599,10 +1600,13 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) {
for (auto &QueueGroup : QueueMap)
for (auto &ZeQueue : QueueGroup.second.ZeQueues)
if (ZeQueue) {
auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
if (!Queue->IsInteropNativeHandle ||
(Queue->IsInteropNativeHandle && checkL0LoaderTeardown())) {
auto ZeResult = ZE_CALL_NOCHECK(zeCommandQueueDestroy, (ZeQueue));
// Gracefully handle the case that L0 was already unloaded.
if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED)
return ze2urResult(ZeResult);
}
}
}

Expand Down
17 changes: 11 additions & 6 deletions unified-runtime/source/adapters/level_zero/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -644,13 +644,18 @@ ur_result_t UR_APICALL urUSMPoolTrimToExp(ur_context_handle_t,
} // namespace ur::level_zero

static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) {
auto ZeResult = ZE_CALL_NOCHECK(zeMemFree, (Context->ZeContext, Ptr));
// Handle When the driver is already released
if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
return UR_RESULT_SUCCESS;
} else {
return ze2urResult(ZeResult);
ur_result_t Res = UR_RESULT_SUCCESS;
if (!Context->IsInteropNativeHandle ||
(Context->IsInteropNativeHandle && checkL0LoaderTeardown())) {
auto ZeResult = ZE_CALL_NOCHECK(zeMemFree, (Context->ZeContext, Ptr));
// Handle When the driver is already released
if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
Res = UR_RESULT_SUCCESS;
} else {
Res = ze2urResult(ZeResult);
}
}
return Res;
}

static ur_result_t USMQueryPageSize(ur_context_handle_t Context, void *Ptr,
Expand Down
4 changes: 3 additions & 1 deletion unified-runtime/source/adapters/level_zero/v2/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ struct ze_handle_wrapper {
return;
}

if (ownZeHandle) {
if ((ownZeHandle && !IsInteropNativeHandle) ||
(ownZeHandle && IsInteropNativeHandle && checkL0LoaderTeardown())) {
auto zeResult = destroy(handle);
// Gracefully handle the case that L0 was already unloaded.
if (zeResult && zeResult != ZE_RESULT_ERROR_UNINITIALIZED)
Expand All @@ -102,6 +103,7 @@ struct ze_handle_wrapper {
private:
ZeHandleT handle;
bool ownZeHandle;
bool IsInteropNativeHandle = false;
};

using ze_kernel_handle_t = HANDLE_WRAPPER_TYPE(::ze_kernel_handle_t,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ ur_result_t urContextCreateWithNativeHandle(

*phContext =
new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle);
(*phContext)->IsInteropNativeHandle = true;
return UR_RESULT_SUCCESS;
} catch (...) {
return exceptionToResult(std::current_exception());
Expand Down
1 change: 1 addition & 0 deletions unified-runtime/source/adapters/level_zero/v2/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,7 @@ urEventCreateWithNativeHandle(ur_native_handle_t hNativeEvent,
ZE2UR_CALL(zeEventHostSignal, ((*phEvent)->getZeEvent()));
} else {
*phEvent = new ur_event_handle_t_(hContext, hNativeEvent, pProperties);
(*phEvent)->IsInteropNativeHandle = true;
}
return UR_RESULT_SUCCESS;
} catch (...) {
Expand Down
1 change: 1 addition & 0 deletions unified-runtime/source/adapters/level_zero/v2/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ urKernelCreateWithNativeHandle(ur_native_handle_t hNativeKernel,

*phKernel =
new ur_kernel_handle_t_(hNativeKernel, hProgram, hContext, pProperties);
(*phKernel)->IsInteropNativeHandle = true;
return UR_RESULT_SUCCESS;
} catch (...) {
return exceptionToResult(std::current_exception());
Expand Down
Loading