Skip to content

Commit 3a3bbe1

Browse files
kswiecicki, staniewzki, pbalcer
authored
[UR][L0] Add initial USM alloc enqueue API (#17112)
Co-authored-by: Michał Staniewski <michal.staniewski@intel.com>
Co-authored-by: Piotr Balcer <piotr.balcer@intel.com>
1 parent 6d25c1e commit 3a3bbe1

File tree

16 files changed

+1431
-64
lines changed

16 files changed

+1431
-64
lines changed

unified-runtime/source/adapters/level_zero/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ if(UR_BUILD_ADAPTER_L0)
5252
${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp
5353
${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp
5454
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
55+
${CMAKE_CURRENT_SOURCE_DIR}/enqueued_pool.cpp
5556
)
5657
install_ur_library(ur_adapter_level_zero)
5758

Lines changed: 239 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,259 @@
1-
//===--------- async_alloc.cpp - CUDA Adapter -----------------------------===//
1+
//===--------- async_alloc.cpp - Level Zero Adapter -----------------------===//
22
//
3-
// Copyright (C) 2024 Intel Corporation
3+
// Copyright (C) 2025 Intel Corporation
44
//
55
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
66
// Exceptions. See LICENSE.TXT
77
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
88
//
99
//===----------------------------------------------------------------------===//
1010

11+
#include "context.hpp"
12+
#include "enqueued_pool.hpp"
13+
#include "event.hpp"
14+
15+
#include "logger/ur_logger.hpp"
16+
17+
#include <umf_helpers.hpp>
1118
#include <ur_api.h>
1219

1320
namespace ur::level_zero {
1421

15-
UR_APIEXPORT ur_result_t urEnqueueUSMDeviceAllocExp(
16-
ur_queue_handle_t, ur_usm_pool_handle_t, const size_t,
17-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
18-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
19-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
22+
/// Shared implementation of the urEnqueueUSM{Host,Device,Shared}AllocExp
/// entry points.
///
/// Asks the selected pool for an enqueued allocation (allocateEnqueued) and
/// falls back to a regular pool allocation (allocate) when none is returned.
/// It then appends a wait on the dependencies and a signal of the resulting
/// event to a command list of \p Queue and submits that command list.
///
/// \param Queue queue the allocation is enqueued on; its mutex is locked
///        while the work is performed.
/// \param Pool optional pool to allocate from; when null, the AsyncPool of
///        the queue's context is used.
/// \param Size requested allocation size in bytes.
/// \param NumEventsInWaitList number of entries in \p EventWaitList.
/// \param EventWaitList events that are waited on before the allocation
///        event is signalled.
/// \param RetMem receives the allocated pointer.
/// \param OutEvent optional; receives the event signalled by the command
///        list. When null, an internal event is created instead.
/// \param Type USM type to allocate; also selects the command type recorded
///        on the event.
/// \return UR_RESULT_SUCCESS on success, UR_RESULT_ERROR_UNKNOWN for an
///         unsupported \p Type, otherwise the first error reported by a
///         callee.
static ur_result_t enqueueUSMAllocHelper(
    ur_queue_handle_t Queue, ur_usm_pool_handle_t Pool, const size_t Size,
    const ur_exp_async_usm_alloc_properties_t *, uint32_t NumEventsInWaitList,
    const ur_event_handle_t *EventWaitList, void **RetMem,
    ur_event_handle_t *OutEvent, ur_usm_type_t Type) {

  // Resolve the command type before any resource is acquired, and report an
  // unsupported type through the return value like every other failure in
  // this function. The callers do not catch exceptions, and rejecting the
  // type here means nothing has been allocated or retained yet.
  ur_command_t CommandType = UR_COMMAND_FORCE_UINT32;
  switch (Type) {
  case UR_USM_TYPE_HOST:
    CommandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
    break;
  case UR_USM_TYPE_DEVICE:
    CommandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
    break;
  case UR_USM_TYPE_SHARED:
    CommandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
    break;
  default:
    logger::error("enqueueUSMAllocHelper: unsupported USM type");
    return UR_RESULT_ERROR_UNKNOWN;
  }

  std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);

  // Allocate USM memory: use the caller's pool when given, otherwise the
  // context-wide async pool.
  ur_usm_pool_handle_t USMPool = Pool ? Pool : &Queue->Context->AsyncPool;

  // No device is passed for host allocations.
  auto Device = (Type == UR_USM_TYPE_HOST) ? nullptr : Queue->Device;

  std::vector<ur_event_handle_t> ExtEventWaitList;
  ur_event_handle_t OriginAllocEvent = nullptr;
  auto AsyncAlloc =
      USMPool->allocateEnqueued(Queue, Device, nullptr, Type, Size);
  if (!AsyncAlloc) {
    // No enqueued allocation was returned; allocate from the pool directly.
    // NOTE(review): if a later step in this function fails, *RetMem is not
    // released here - confirm whether that is handled elsewhere.
    auto Ret =
        USMPool->allocate(Queue->Context, Device, nullptr, Type, Size, RetMem);
    if (Ret) {
      return Ret;
    }
  } else {
    *RetMem = std::get<0>(*AsyncAlloc);
    OriginAllocEvent = std::get<1>(*AsyncAlloc);
    if (OriginAllocEvent) {
      // The event that came with the enqueued allocation becomes one more
      // dependency, appended after the caller's own wait list.
      ExtEventWaitList.reserve(static_cast<size_t>(NumEventsInWaitList) + 1);
      for (size_t I = 0; I < NumEventsInWaitList; ++I) {
        ExtEventWaitList.push_back(EventWaitList[I]);
      }
      ExtEventWaitList.push_back(OriginAllocEvent);
    }
  }

  if (!ExtEventWaitList.empty()) {
    NumEventsInWaitList = static_cast<uint32_t>(ExtEventWaitList.size());
    EventWaitList = ExtEventWaitList.data();
  }

  bool UseCopyEngine = false;
  _ur_ze_event_list_t TmpWaitList;
  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
      NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

  bool OkToBatch = true;
  // Get a new command list to be used on this call
  ur_command_list_ptr_t CommandList{};
  UR_CALL(Queue->Context->getAvailableCommandList(
      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
      OkToBatch, nullptr /*ForcedCmdQueue*/));

  // When the caller did not ask for an event, an internal one is used.
  ur_event_handle_t InternalEvent{};
  bool IsInternal = OutEvent == nullptr;
  ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;

  UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                       IsInternal, false));
  ze_event_handle_t ZeEvent = (*Event)->ZeEvent;
  (*Event)->WaitList = TmpWaitList;
  (*Event)->OriginAllocEvent = OriginAllocEvent;

  const auto &ZeCommandList = CommandList->first;
  const auto &WaitList = (*Event)->WaitList;
  if (WaitList.Length) {
    ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
               (ZeCommandList, WaitList.Length, WaitList.ZeEventList));
  }

  // Signal that USM allocation event was finished
  ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));

  UR_CALL(Queue->executeCommandList(CommandList, false, OkToBatch));

  return UR_RESULT_SUCCESS;
}
118+
119+
/// Enqueues an asynchronous USM device allocation on a queue.
///
/// Forwards all arguments to enqueueUSMAllocHelper with UR_USM_TYPE_DEVICE.
///
/// \param Queue [in] handle of the queue object.
/// \param Pool [in][optional] USM pool to allocate from.
/// \param Size [in] minimum size in bytes of the USM memory object to be
///        allocated.
/// \param Properties [in][optional] pointer to the enqueue asynchronous USM
///        allocation properties.
/// \param NumEventsInWaitList [in] size of the event wait list.
/// \param EventWaitList [in][optional][range(0, NumEventsInWaitList)] events
///        that must be complete before the allocation event is signalled. If
///        nullptr, NumEventsInWaitList must be 0, indicating no wait events.
/// \param Mem [out] pointer to the USM memory object.
/// \param OutEvent [out][optional] event object that identifies the
///        asynchronous allocation.
/// \return the result reported by enqueueUSMAllocHelper.
ur_result_t urEnqueueUSMDeviceAllocExp(
    ur_queue_handle_t Queue, ur_usm_pool_handle_t Pool, const size_t Size,
    const ur_exp_async_usm_alloc_properties_t *Properties,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
    void **Mem, ur_event_handle_t *OutEvent) {
  const ur_usm_type_t AllocType = UR_USM_TYPE_DEVICE;
  return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
                               NumEventsInWaitList, EventWaitList, Mem,
                               OutEvent, AllocType);
}
21142

22-
UR_APIEXPORT ur_result_t urEnqueueUSMSharedAllocExp(
23-
ur_queue_handle_t, ur_usm_pool_handle_t, const size_t,
24-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
25-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
26-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
143+
/// Enqueues an asynchronous USM shared allocation on a queue.
///
/// Forwards all arguments to enqueueUSMAllocHelper with UR_USM_TYPE_SHARED.
///
/// \param Queue [in] handle of the queue object.
/// \param Pool [in][optional] USM pool to allocate from.
/// \param Size [in] minimum size in bytes of the USM memory object to be
///        allocated.
/// \param Properties [in][optional] pointer to the enqueue asynchronous USM
///        allocation properties.
/// \param NumEventsInWaitList [in] size of the event wait list.
/// \param EventWaitList [in][optional][range(0, NumEventsInWaitList)] events
///        that must be complete before the allocation event is signalled. If
///        nullptr, NumEventsInWaitList must be 0, indicating no wait events.
/// \param Mem [out] pointer to the USM memory object.
/// \param OutEvent [out][optional] event object that identifies the
///        asynchronous allocation.
/// \return the result reported by enqueueUSMAllocHelper.
ur_result_t urEnqueueUSMSharedAllocExp(
    ur_queue_handle_t Queue, ur_usm_pool_handle_t Pool, const size_t Size,
    const ur_exp_async_usm_alloc_properties_t *Properties,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
    void **Mem, ur_event_handle_t *OutEvent) {
  const ur_usm_type_t AllocType = UR_USM_TYPE_SHARED;
  return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
                               NumEventsInWaitList, EventWaitList, Mem,
                               OutEvent, AllocType);
}
28166

29-
UR_APIEXPORT ur_result_t urEnqueueUSMHostAllocExp(
30-
ur_queue_handle_t, ur_usm_pool_handle_t, const size_t,
31-
const ur_exp_async_usm_alloc_properties_t *, uint32_t,
32-
const ur_event_handle_t *, void **, ur_event_handle_t *) {
33-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
167+
/// Enqueues an asynchronous USM host allocation on a queue.
///
/// Forwards all arguments to enqueueUSMAllocHelper with UR_USM_TYPE_HOST.
///
/// \param Queue [in] handle of the queue object.
/// \param Pool [in][optional] handle of the USM memory pool.
/// \param Size [in] minimum size in bytes of the USM memory object to be
///        allocated.
/// \param Properties [in][optional] pointer to the enqueue asynchronous USM
///        allocation properties.
/// \param NumEventsInWaitList [in] size of the event wait list.
/// \param EventWaitList [in][optional][range(0, NumEventsInWaitList)] events
///        that must be complete before the allocation event is signalled. If
///        nullptr, NumEventsInWaitList must be 0, indicating no wait events.
/// \param Mem [out] pointer to the USM memory object.
/// \param OutEvent [out][optional] event object that identifies the
///        asynchronous USM host allocation.
/// \return the result reported by enqueueUSMAllocHelper.
ur_result_t urEnqueueUSMHostAllocExp(
    ur_queue_handle_t Queue, ur_usm_pool_handle_t Pool, const size_t Size,
    const ur_exp_async_usm_alloc_properties_t *Properties,
    uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
    void **Mem, ur_event_handle_t *OutEvent) {
  const ur_usm_type_t AllocType = UR_USM_TYPE_HOST;
  return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
                               NumEventsInWaitList, EventWaitList, Mem,
                               OutEvent, AllocType);
}
35191

36-
UR_APIEXPORT ur_result_t urEnqueueUSMFreeExp(ur_queue_handle_t,
37-
ur_usm_pool_handle_t, void *,
38-
uint32_t,
39-
const ur_event_handle_t *,
40-
ur_event_handle_t *) {
41-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
192+
/// Enqueues an asynchronous free of a USM allocation on a queue.
///
/// Creates an event on a command list of \p Queue that waits for the given
/// dependencies. If \p Mem belongs to a UMF pool tagged with a UsmPool, the
/// pointer is inserted into that pool's AsyncPool together with the event,
/// and the event is signalled by the submitted command list. Otherwise the
/// pointer is released through USMFreeHelper.
///
/// \param Queue [in] handle of the queue object.
/// \param (unnamed) [in][optional] USM pool descriptor; not used here.
/// \param Mem [in] pointer to the USM memory object.
/// \param NumEventsInWaitList [in] size of the event wait list.
/// \param EventWaitList [in][optional][range(0, NumEventsInWaitList)] events
///        the free event waits on. If nullptr, NumEventsInWaitList must be 0,
///        indicating no wait events.
/// \param OutEvent [out][optional] event object that identifies the
///        asynchronous free.
/// \return UR_RESULT_SUCCESS, the result of USMFreeHelper on the fallback
///         path, or the first error reported by a callee.
ur_result_t urEnqueueUSMFreeExp(ur_queue_handle_t Queue, ur_usm_pool_handle_t,
                                void *Mem, uint32_t NumEventsInWaitList,
                                const ur_event_handle_t *EventWaitList,
                                ur_event_handle_t *OutEvent) {
  std::scoped_lock<ur_shared_mutex> QueueLock(Queue->Mutex);

  bool UseCopyEngine = false;
  _ur_ze_event_list_t RetainedWaitList;
  UR_CALL(RetainedWaitList.createAndRetainUrZeEventList(
      NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

  bool OkToBatch = false;
  // Get a new command list to be used on this call
  ur_command_list_ptr_t CommandList{};
  UR_CALL(Queue->Context->getAvailableCommandList(
      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
      OkToBatch, nullptr /*ForcedCmdQueue*/));

  // When the caller did not ask for an event, an internal one is used.
  ur_event_handle_t InternalEvent{};
  const bool IsInternal = (OutEvent == nullptr);
  ur_event_handle_t *Event = IsInternal ? &InternalEvent : OutEvent;

  UR_CALL(createEventAndAssociateQueue(Queue, Event,
                                       UR_COMMAND_ENQUEUE_USM_FREE_EXP,
                                       CommandList, IsInternal, false));
  ze_event_handle_t ZeEvent = (*Event)->ZeEvent;
  (*Event)->WaitList = RetainedWaitList;

  const auto &ZeCommandList = CommandList->first;
  const auto &WaitList = (*Event)->WaitList;
  if (WaitList.Length) {
    ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
               (ZeCommandList, WaitList.Length, WaitList.ZeEventList));
  }

  // Look up the UsmPool tagged on the UMF pool that owns Mem, if any.
  UsmPool *OwningPool = nullptr;
  auto UmfPool = umfPoolByPtr(Mem);
  if (UmfPool) {
    const auto TagResult =
        umfPoolGetTag(UmfPool, reinterpret_cast<void **>(&OwningPool));
    if (TagResult != UMF_RESULT_SUCCESS) {
      OwningPool = nullptr;
    }
  }

  if (OwningPool == nullptr) {
    // NOTE(review): this path returns before the signal below is appended
    // and before the command list is executed, so the event created above is
    // not signalled by this function - confirm that this is intended.
    return USMFreeHelper(Queue->Context, Mem);
  }

  const size_t UsableSize = umfPoolMallocUsableSize(UmfPool, Mem);
  OwningPool->AsyncPool.insert(Mem, UsableSize, *Event, Queue);

  // Signal that USM free event was finished
  ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));

  UR_CALL(Queue->executeCommandList(CommandList, false, OkToBatch));

  return UR_RESULT_SUCCESS;
}
43259
} // namespace ur::level_zero

unified-runtime/source/adapters/level_zero/context.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <mutex>
1414
#include <string.h>
1515

16+
#include "adapters/level_zero/usm.hpp"
1617
#include "context.hpp"
1718
#include "logger/ur_logger.hpp"
1819
#include "queue.hpp"
@@ -295,6 +296,8 @@ ur_result_t ur_context_handle_t_::finalize() {
295296
// urContextRelease. There could be some memory that may have not been
296297
// deallocated. For example, event and event pool caches would be still alive.
297298

299+
AsyncPool.cleanupPools();
300+
298301
if (!DisableEventsCaching) {
299302
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
300303
for (auto &EventCache : EventCaches) {

unified-runtime/source/adapters/level_zero/context.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,14 @@ struct ur_context_handle_t_ : _ur_object {
5858
ur_context_handle_t_(ze_context_handle_t ZeContext, uint32_t NumDevices,
5959
const ur_device_handle_t *Devs, bool OwnZeContext)
6060
: ZeContext{ZeContext}, Devices{Devs, Devs + NumDevices},
61-
NumDevices{NumDevices}, DefaultPool{this, nullptr, !UseUSMAllocator} {
61+
NumDevices{NumDevices}, DefaultPool{this, nullptr, !UseUSMAllocator},
62+
AsyncPool{this, nullptr, !UseUSMAllocator} {
6263
OwnNativeHandle = OwnZeContext;
6364
}
6465

6566
ur_context_handle_t_(ze_context_handle_t ZeContext)
66-
: ZeContext{ZeContext}, DefaultPool{this, nullptr, !UseUSMAllocator} {}
67+
: ZeContext{ZeContext}, DefaultPool{this, nullptr, !UseUSMAllocator},
68+
AsyncPool{this, nullptr, !UseUSMAllocator} {}
6769

6870
// A L0 context handle is primarily used during creation and management of
6971
// resources that may be used by multiple devices.
@@ -126,6 +128,9 @@ struct ur_context_handle_t_ : _ur_object {
126128
// the 'UseUSMAllocator' variable value.
127129
ur_usm_pool_handle_t_ DefaultPool;
128130

131+
// USM pools for async allocations.
132+
ur_usm_pool_handle_t_ AsyncPool;
133+
129134
// Map associating pools created with urUsmPoolCreate and internal pools
130135
std::list<ur_usm_pool_handle_t> UsmPoolHandles{};
131136

0 commit comments

Comments
 (0)