
Commit 00c8adb

staniewzki authored and kbenzie committed
Port USM alloc to adapter v2 (#18179)
This PR ports the USM alloc enqueue API, introduced to the L0 adapter in intel/llvm#17112, to the L0 adapter v2.
1 parent e1bef7f commit 00c8adb
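
For orientation, a hedged usage sketch (not part of this commit) of the enqueue-ordered alloc/free path the port enables; the entry-point signatures are assumed to mirror the v2 member functions in the diff below, with the queue handle prepended, and error handling is elided.

#include <ur_api.h>

// Hypothetical round trip: allocate device USM as part of the queue's
// in-order stream, use it, then enqueue the free so the memory lands in the
// queue's async pool and can be reused by a later alloc on the same queue.
void asyncUsmRoundTrip(ur_queue_handle_t hQueue, size_t size) {
  void *ptr = nullptr;
  ur_event_handle_t allocDone = nullptr;

  // pPool == nullptr is assumed to fall back to the context's async pool
  // (hContext->getAsyncPool() in the v2 adapter).
  urEnqueueUSMDeviceAllocExp(hQueue, /*pPool=*/nullptr, size,
                             /*pProperties=*/nullptr, 0, nullptr, &ptr,
                             &allocDone);

  // ... enqueue kernels that consume ptr, waiting on allocDone ...

  // The free is also enqueued; once its event completes, the pointer is
  // parked in the pool's freelist, keyed by this queue.
  urEnqueueUSMFreeExp(hQueue, /*pPool=*/nullptr, ptr, 0, nullptr, nullptr);

  urEventRelease(allocDone);
}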

File tree

12 files changed, +271 −51 lines


source/adapters/level_zero/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
     ${CMAKE_CURRENT_SOURCE_DIR}/common.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/command_buffer_command.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/enqueued_pool.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/image_common.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp

source/adapters/level_zero/async_alloc.cpp

Lines changed: 2 additions & 1 deletion
@@ -93,7 +93,7 @@ static ur_result_t enqueueUSMAllocHelper(
     break;
   default:
     UR_LOG(ERR, "enqueueUSMAllocHelper: unsupported USM type");
-    throw UR_RESULT_ERROR_UNKNOWN;
+    throw UR_RESULT_ERROR_INVALID_ARGUMENT;
   }
   UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
                                        IsInternal, false));
@@ -247,6 +247,7 @@ ur_result_t urEnqueueUSMFreeExp(
   }

   size_t size = umfPoolMallocUsableSize(hPool, Mem);
+  (*Event)->RefCount.increment();
   usmPool->AsyncPool.insert(Mem, size, *Event, Queue);

   // Signal that USM free event was finished

source/adapters/level_zero/enqueued_pool.cpp

Lines changed: 7 additions & 8 deletions
@@ -9,15 +9,13 @@
 //===----------------------------------------------------------------------===//

 #include "enqueued_pool.hpp"
-#include "event.hpp"

 #include <ur_api.h>

 EnqueuedPool::~EnqueuedPool() { cleanup(); }

 std::optional<EnqueuedPool::Allocation>
-EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
-                         ur_queue_handle_t Queue) {
+EnqueuedPool::getBestFit(size_t Size, size_t Alignment, void *Queue) {
   auto Lock = std::lock_guard(Mutex);

   Allocation Alloc = {nullptr, Size, nullptr, Queue, Alignment};
@@ -47,12 +45,11 @@ EnqueuedPool::getBestFit(size_t Size, size_t Alignment,
 }

 void EnqueuedPool::insert(void *Ptr, size_t Size, ur_event_handle_t Event,
-                          ur_queue_handle_t Queue) {
+                          void *Queue) {
   auto Lock = std::lock_guard(Mutex);

   uintptr_t Address = (uintptr_t)Ptr;
   size_t Alignment = Address & (~Address + 1);
-  Event->RefCount.increment();

   Freelist.emplace(Allocation{Ptr, Size, Event, Queue, Alignment});
 }
@@ -67,14 +64,15 @@ bool EnqueuedPool::cleanup() {
     auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It.Ptr);
     assert(umfRet == UMF_RESULT_SUCCESS);

-    urEventReleaseInternal(It.Event);
+    if (It.Event)
+      eventRelease(It.Event);
   }
   Freelist.clear();

   return FreedAllocations;
 }

-bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
+bool EnqueuedPool::cleanupForQueue(void *Queue) {
   auto Lock = std::lock_guard(Mutex);

   Allocation Alloc = {nullptr, 0, nullptr, Queue, 0};
@@ -90,7 +88,8 @@ bool EnqueuedPool::cleanupForQueue(ur_queue_handle_t Queue) {
     auto umfRet [[maybe_unused]] = umfPoolFree(hPool, It->Ptr);
     assert(umfRet == UMF_RESULT_SUCCESS);

-    urEventReleaseInternal(It->Event);
+    if (It->Event)
+      eventRelease(It->Event);

     // Erase the current allocation and move to the next one
     It = Freelist.erase(It);
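
The alignment recorded for each pooled pointer is derived from its address: Address & (~Address + 1) is the two's-complement trick for isolating the lowest set bit, i.e. the largest power of two that divides the address. A small standalone illustration (the helper name is ours, not the adapter's):

#include <cstdint>
#include <cstdio>

// Illustrative helper (not part of the patch): the largest power-of-two
// alignment guaranteed by a pointer value. x & (~x + 1) == x & -x isolates
// the lowest set bit of x.
static std::size_t maxAlignmentOf(const void *Ptr) {
  std::uintptr_t Address = reinterpret_cast<std::uintptr_t>(Ptr);
  return static_cast<std::size_t>(Address & (~Address + 1));
}

int main() {
  // 0x1040 has lowest set bit 0x40, so a pointer at that address is
  // (at least) 64-byte aligned.
  std::printf("%zu\n", maxAlignmentOf(reinterpret_cast<void *>(0x1040)));
  return 0;
}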

source/adapters/level_zero/enqueued_pool.hpp

Lines changed: 13 additions & 5 deletions
@@ -22,17 +22,24 @@ class EnqueuedPool {
     void *Ptr;
     size_t Size;
     ur_event_handle_t Event;
-    ur_queue_handle_t Queue;
+    // Queue handle, used as an identifier for the associated queue.
+    // This can either be a `ur_queue_handle_t` or a pointer to a v2 queue
+    // object.
+    void *Queue;
     size_t Alignment;
   };

+  using event_release_callback_t = ur_result_t (*)(ur_event_handle_t);
+
+  EnqueuedPool(event_release_callback_t eventRelease)
+      : eventRelease(eventRelease) {}
+
   ~EnqueuedPool();
   std::optional<Allocation> getBestFit(size_t Size, size_t Alignment,
-                                       ur_queue_handle_t Queue);
-  void insert(void *Ptr, size_t Size, ur_event_handle_t Event,
-              ur_queue_handle_t Queue);
+                                       void *Queue);
+  void insert(void *Ptr, size_t Size, ur_event_handle_t Event, void *Queue);
   bool cleanup();
-  bool cleanupForQueue(ur_queue_handle_t Queue);
+  bool cleanupForQueue(void *Queue);

 private:
   struct Comparator {
@@ -53,4 +60,5 @@ class EnqueuedPool {
   using AllocationSet = std::set<Allocation, Comparator>;
   ur_mutex Mutex;
   AllocationSet Freelist;
+  event_release_callback_t eventRelease;
 };
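
The two changes visible above (the queue key becoming an opaque void * and event release being injected as a function pointer) are what let the legacy adapter and the v2 adapter share one EnqueuedPool. A hedged, self-contained sketch of how this interface is meant to be used (not code from the patch; the stub stands in for whatever release function the adapter registers):

#include "enqueued_pool.hpp"
#include <cstddef>

// Stand-in for the adapter's real release function; the legacy adapter
// passes urEventReleaseInternal (see usm.hpp below).
static ur_result_t releaseEventStub(ur_event_handle_t) {
  return UR_RESULT_SUCCESS;
}

// Hypothetical wiring: queueKey may be a ur_queue_handle_t or a v2 queue
// object pointer; the pool only uses it as an identity key, and ptr is
// expected to come from a UMF pool so cleanup() can return it via umfPoolFree.
void sketch(void *queueKey, void *ptr, size_t size, ur_event_handle_t ev) {
  EnqueuedPool pool(&releaseEventStub);

  // Park a freed allocation. The pool no longer retains the event itself;
  // the caller increments the refcount before insert (see the async_alloc.cpp
  // hunk above) and the pool releases it through the callback during cleanup.
  pool.insert(ptr, size, ev, queueKey);

  // A later enqueued allocation on the same queue can reuse the entry.
  if (auto hit = pool.getBestFit(size, /*Alignment=*/64, queueKey)) {
    (void)hit->Ptr;
    (void)hit->Event;
  }

  // Per-queue teardown releases events and returns memory to the UMF pool.
  pool.cleanupForQueue(queueKey);
}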

source/adapters/level_zero/usm.hpp

Lines changed: 5 additions & 1 deletion
@@ -12,6 +12,7 @@
 #include "common.hpp"

 #include "enqueued_pool.hpp"
+#include "event.hpp"
 #include "ur_api.h"
 #include "ur_pool_manager.hpp"
 #include <set>
@@ -20,7 +21,10 @@
 usm::DisjointPoolAllConfigs InitializeDisjointPoolConfig();

 struct UsmPool {
-  UsmPool(umf::pool_unique_handle_t Pool) : UmfPool(std::move(Pool)) {}
+  UsmPool(umf::pool_unique_handle_t Pool)
+      : UmfPool(std::move(Pool)), AsyncPool([](ur_event_handle_t Event) {
+          return urEventReleaseInternal(Event);
+        }) {}
   umf::pool_unique_handle_t UmfPool;
   // 'AsyncPool' needs to be declared after 'UmfPool' so its destructor is
   // invoked first.
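
The declaration-order comment above is doing real work: C++ destroys non-static data members in reverse declaration order, so declaring AsyncPool after UmfPool guarantees the async pool (and its pending freelist entries) is torn down while the UMF pool it frees into is still alive. A minimal, standalone illustration of that rule:

#include <cstdio>

// Toy types only; they illustrate member destruction order, nothing more.
struct Tracer {
  const char *name;
  ~Tracer() { std::printf("destroying %s\n", name); }
};

struct PoolLike {
  Tracer first{"UmfPool (declared first, destroyed last)"};
  Tracer second{"AsyncPool (declared last, destroyed first)"};
};

int main() {
  PoolLike p;
  // On scope exit this prints:
  //   destroying AsyncPool (declared last, destroyed first)
  //   destroying UmfPool (declared first, destroyed last)
  return 0;
}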

source/adapters/level_zero/v2/context.cpp

Lines changed: 3 additions & 1 deletion
@@ -77,7 +77,7 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext,
                            v2::EVENT_FLAGS_PROFILING_ENABLED)),
       p2pAccessDevices(populateP2PDevices(
           phDevices[0]->Platform->getNumDevices(), this->hDevices)),
-      defaultUSMPool(this, nullptr) {}
+      defaultUSMPool(this, nullptr), asyncPool(this, nullptr) {}

 ur_result_t ur_context_handle_t_::retain() {
   RefCount.increment();
@@ -114,6 +114,8 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() {
   return &defaultUSMPool;
 }

+ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() { return &asyncPool; }
+
 const std::vector<ur_device_handle_t> &
 ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const {
   return p2pAccessDevices[hDevice->Id.value()];

source/adapters/level_zero/v2/context.hpp

Lines changed: 2 additions & 0 deletions
@@ -31,6 +31,7 @@ struct ur_context_handle_t_ : ur_object {

   const std::vector<ur_device_handle_t> &getDevices() const;
   ur_usm_pool_handle_t getDefaultUSMPool();
+  ur_usm_pool_handle_t getAsyncPool();

   const std::vector<ur_device_handle_t> &
   getP2PDevices(ur_device_handle_t hDevice) const;
@@ -67,4 +68,5 @@ struct ur_context_handle_t_ : ur_object {
   const std::vector<std::vector<ur_device_handle_t>> p2pAccessDevices;

   ur_usm_pool_handle_t_ defaultUSMPool;
+  ur_usm_pool_handle_t_ asyncPool;
 };

source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 142 additions & 16 deletions
@@ -159,6 +159,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
   ZE2UR_CALL(zeCommandListHostSynchronize,
              (commandListLocked->getZeCommandList(), UINT64_MAX));

+  hContext->getAsyncPool()->cleanupPoolsForQueue(this);
+
   // Free deferred kernels
   for (auto &hKernel : submittedKernels) {
     UR_CALL(hKernel->release());
@@ -706,31 +708,155 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe(
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }

+ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper(
+    ur_usm_pool_handle_t pPool, const size_t size,
+    const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, void **ppMem,
+    ur_event_handle_t *phEvent, ur_usm_type_t type) {
+  auto commandListLocked = commandListManager.lock();
+
+  if (!pPool) {
+    pPool = hContext->getAsyncPool();
+  }
+
+  auto device = (type == UR_USM_TYPE_HOST) ? nullptr : hDevice;
+
+  ur_event_handle_t originAllocEvent = nullptr;
+  auto asyncAlloc = pPool->allocateEnqueued(hContext, this, true, device,
+                                            nullptr, type, size);
+  if (!asyncAlloc) {
+    auto Ret = pPool->allocate(hContext, device, nullptr, type, size, ppMem);
+    if (Ret) {
+      return Ret;
+    }
+  } else {
+    std::tie(*ppMem, originAllocEvent) = *asyncAlloc;
+  }
+
+  auto waitListView = getWaitListView(commandListLocked, phEventWaitList,
+                                      numEventsInWaitList, originAllocEvent);
+
+  ur_command_t commandType = UR_COMMAND_FORCE_UINT32;
+  switch (type) {
+  case UR_USM_TYPE_HOST:
+    commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
+    break;
+  case UR_USM_TYPE_DEVICE:
+    commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
+    break;
+  case UR_USM_TYPE_SHARED:
+    commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
+    break;
+  default:
+    UR_LOG(ERR, "enqueueUSMAllocHelper: unsupported USM type");
+    throw UR_RESULT_ERROR_INVALID_ARGUMENT;
+  }
+
+  auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, commandType);
+  auto [pWaitEvents, numWaitEvents] = waitListView;
+
+  if (numWaitEvents > 0) {
+    ZE2UR_CALL(
+        zeCommandListAppendWaitOnEvents,
+        (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
+  }
+  if (zeSignalEvent) {
+    ZE2UR_CALL(zeCommandListAppendSignalEvent,
+               (commandListLocked->getZeCommandList(), zeSignalEvent));
+  }
+  if (originAllocEvent) {
+    originAllocEvent->release();
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
 ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp(
-    ur_usm_pool_handle_t, const size_t,
-    const ur_exp_async_usm_alloc_properties_t *, uint32_t,
-    const ur_event_handle_t *, void **, ur_event_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_usm_pool_handle_t pPool, const size_t size,
+    const ur_exp_async_usm_alloc_properties_t *pProperties,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    void **ppMem, ur_event_handle_t *phEvent) {
+  TRACK_SCOPE_LATENCY(
+      "ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp");
+
+  return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
+                               phEventWaitList, ppMem, phEvent,
+                               UR_USM_TYPE_DEVICE);
 }

 ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp(
-    ur_usm_pool_handle_t, const size_t,
-    const ur_exp_async_usm_alloc_properties_t *, uint32_t,
-    const ur_event_handle_t *, void **, ur_event_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_usm_pool_handle_t pPool, const size_t size,
+    const ur_exp_async_usm_alloc_properties_t *pProperties,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    void **ppMem, ur_event_handle_t *phEvent) {
+  TRACK_SCOPE_LATENCY(
+      "ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp");
+
+  return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
+                               phEventWaitList, ppMem, phEvent,
+                               UR_USM_TYPE_SHARED);
 }

 ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp(
-    ur_usm_pool_handle_t, const size_t,
-    const ur_exp_async_usm_alloc_properties_t *, uint32_t,
-    const ur_event_handle_t *, void **, ur_event_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_usm_pool_handle_t pPool, const size_t size,
+    const ur_exp_async_usm_alloc_properties_t *pProperties,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    void **ppMem, ur_event_handle_t *phEvent) {
+  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp");
+
+  return enqueueUSMAllocHelper(pPool, size, pProperties, numEventsInWaitList,
+                               phEventWaitList, ppMem, phEvent,
+                               UR_USM_TYPE_HOST);
 }

 ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp(
-    ur_usm_pool_handle_t, void *, uint32_t, const ur_event_handle_t *,
-    ur_event_handle_t *) {
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+    ur_usm_pool_handle_t, void *pMem, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFreeExp");
+  auto commandListLocked = commandListManager.lock();
+  ur_event_handle_t internalEvent = nullptr;
+  if (phEvent == nullptr) {
+    phEvent = &internalEvent;
+  }
+
+  auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent,
+                                      UR_COMMAND_ENQUEUE_USM_FREE_EXP);
+  auto [pWaitEvents, numWaitEvents] =
+      getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList);
+
+  umf_memory_pool_handle_t hPool = umfPoolByPtr(pMem);
+  if (!hPool) {
+    return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
+  }
+
+  UsmPool *usmPool = nullptr;
+  auto ret = umfPoolGetTag(hPool, (void **)&usmPool);
+  if (ret != UMF_RESULT_SUCCESS || !usmPool) {
+    // This should never happen
+    UR_LOG(ERR, "enqueueUSMFreeExp: invalid pool tag");
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  size_t size = umfPoolMallocUsableSize(hPool, pMem);
+  if (internalEvent == nullptr) {
+    // When the output event is used instead of an internal event, we need to
+    // increment the refcount.
+    (*phEvent)->RefCount.increment();
+  }
+
+  if (numWaitEvents > 0) {
+    ZE2UR_CALL(
+        zeCommandListAppendWaitOnEvents,
+        (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
+  }
+
+  ZE2UR_CALL(zeCommandListAppendSignalEvent,
+             (commandListLocked->getZeCommandList(), zeSignalEvent));
+
+  // Insert must be done after the signal event is appended.
+  usmPool->asyncPool.insert(pMem, size, *phEvent, this);
+
+  return UR_RESULT_SUCCESS;
 }

 ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp(
@@ -881,9 +1007,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp(
       "ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp");

   auto commandListLocked = commandListManager.lock();
+
   auto zeSignalEvent =
       getSignalEvent(commandListLocked, phEvent, callerCommand);
-
   auto [pWaitEvents, numWaitEvents] =
       getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList,
                       additionalWaitEvent);

source/adapters/level_zero/v2/queue_immediate_in_order.hpp

Lines changed: 7 additions & 0 deletions
@@ -64,6 +64,13 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ {

   void recordSubmittedKernel(ur_kernel_handle_t hKernel);

+  ur_result_t
+  enqueueUSMAllocHelper(ur_usm_pool_handle_t pPool, const size_t size,
+                        const ur_exp_async_usm_alloc_properties_t *pProperties,
+                        uint32_t numEventsInWaitList,
+                        const ur_event_handle_t *phEventWaitList, void **ppMem,
+                        ur_event_handle_t *phEvent, ur_usm_type_t Type);
+
 public:
   ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
                                 const ur_queue_properties_t *);

0 commit comments
