Skip to content

Commit 769d3b9

Browse files
author
Ilya Stepykin
committed
[SYCL] USM shared memory allocator for L0 plugin
In L0, each allocation consumes at least one memory page no matter how small the requested size is. This adds significant overhead when an app performs many small allocations. This patch adds a memory allocator on top of the L0 USM API in order to solve the problem. High-level description: There is a predefined list of bucket sizes. When the allocation function is called, the best-fitting bucket is found for the requested size. If there is a free chunk in the bucket, it is returned; otherwise a new slab of size 64k is requested from the system via the L0 API. This slab is split into chunks of the size of the corresponding bucket, so there is now a free chunk in the slab, which is returned. When a chunk is returned, it is marked as non-free until the user calls free with the corresponding pointer. In addition, this patch adds an environment variable, SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR, which disables the allocator and restores the original behavior. Signed-off-by: Ilya Stepykin <ilya.stepykin@intel.com>
1 parent 31843cc commit 769d3b9

File tree

9 files changed

+1137
-13
lines changed

9 files changed

+1137
-13
lines changed

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ subject to change. Do not rely on these variables in production code.
2626
| SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. |
2727
| SYCL_DEVICELIB_NO_FALLBACK | Any(\*) | Disable loading and linking of device library images |
2828
| SYCL_PI_LEVEL0_MAX_COMMAND_LIST_CACHE | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availability and workload demand. |
29+
| SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR | Any(\*) | Disable USM allocator in L0 plugin (each memory request will go directly to the L0 runtime) |
2930

3031
`(*) Note: Any means this environment variable is effective when set to any non-null value.`
3132

sycl/plugins/level_zero/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ add_library(pi_level_zero SHARED
7575
"${sycl_inc_dir}/CL/sycl/detail/pi.h"
7676
"${CMAKE_CURRENT_SOURCE_DIR}/pi_level_zero.cpp"
7777
"${CMAKE_CURRENT_SOURCE_DIR}/pi_level_zero.hpp"
78+
"${CMAKE_CURRENT_SOURCE_DIR}/usm_allocator.cpp"
79+
"${CMAKE_CURRENT_SOURCE_DIR}/usm_allocator.hpp"
7880
)
7981

8082
if (MSVC)

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 183 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222

2323
#include <level_zero/zet_api.h>
2424

25+
#include "usm_allocator.hpp"
26+
2527
namespace {
2628

2729
// Controls Level Zero calls serialization to w/a Level Zero driver being not MT
@@ -1491,10 +1493,16 @@ pi_result piContextRelease(pi_context Context) {
14911493

14921494
assert(Context);
14931495
if (--(Context->RefCount) == 0) {
1496+
auto ZeContext = Context->ZeContext;
14941497
// Destroy the command list used for initializations
14951498
ZE_CALL(zeCommandListDestroy(Context->ZeCommandListInit));
1496-
ZE_CALL(zeContextDestroy(Context->ZeContext));
14971499
delete Context;
1500+
1501+
// Destruction of some members of pi_context uses L0 context
1502+
// and therefore it must be valid at that point.
1503+
// Technically it should be placed to the destructor of pi_context
1504+
// but this makes API error handling more complex.
1505+
ZE_CALL(zeContextDestroy(ZeContext));
14981506
}
14991507
return PI_SUCCESS;
15001508
}
@@ -4052,7 +4060,6 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program,
40524060
pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
40534061
pi_usm_mem_properties *Properties, size_t Size,
40544062
pi_uint32 Alignment) {
4055-
40564063
assert(Context);
40574064
// Check that incorrect bits are not set in the properties.
40584065
assert(!Properties || (Properties && !(*Properties & ~PI_MEM_ALLOC_FLAGS)));
@@ -4066,11 +4073,17 @@ pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
40664073
return PI_SUCCESS;
40674074
}
40684075

4069-
pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
4070-
pi_device Device,
4071-
pi_usm_mem_properties *Properties, size_t Size,
4072-
pi_uint32 Alignment) {
4076+
// Reports whether the USM allocator should be used.
//
// Returns true when the environment variable
// SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR is not set (std::getenv yields
// nullptr), and false when it is set. Only the presence of the variable is
// checked; its value is never inspected.
static bool ShouldUseUSMAllocator() {
4077+
// Enable allocator by default if it's not explicitly disabled
4078+
return std::getenv("SYCL_PI_LEVEL0_DISABLE_USM_ALLOCATOR") == nullptr;
4079+
}
4080+
4081+
// Result of ShouldUseUSMAllocator(), computed once when this static is
// initialized; the environment is not consulted again afterwards.
static const bool UseUSMAllocator = ShouldUseUSMAllocator();
40734082

4083+
pi_result USMDeviceAllocImpl(void **ResultPtr, pi_context Context,
4084+
pi_device Device,
4085+
pi_usm_mem_properties *Properties, size_t Size,
4086+
pi_uint32 Alignment) {
40744087
assert(Context);
40754088
assert(Device);
40764089
// Check that incorrect bits are not set in the properties.
@@ -4086,11 +4099,10 @@ pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
40864099
return PI_SUCCESS;
40874100
}
40884101

4089-
pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
4090-
pi_device Device,
4091-
pi_usm_mem_properties *Properties, size_t Size,
4092-
pi_uint32 Alignment) {
4093-
4102+
pi_result USMSharedAllocImpl(void **ResultPtr, pi_context Context,
4103+
pi_device Device,
4104+
pi_usm_mem_properties *Properties, size_t Size,
4105+
pi_uint32 Alignment) {
40944106
assert(Context);
40954107
assert(Device);
40964108
// Check that incorrect bits are not set in the properties.
@@ -4108,11 +4120,170 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
41084120
return PI_SUCCESS;
41094121
}
41104122

4111-
pi_result piextUSMFree(pi_context Context, void *Ptr) {
4123+
// Frees a USM pointer by calling zeMemFree on the Level Zero context held in
// Context->ZeContext, bypassing the USM allocator.
//
// Context - PI context whose ZeContext is passed to zeMemFree; it is
//           dereferenced without a null check.
// Ptr     - pointer handed to zeMemFree.
//
// Returns PI_SUCCESS when execution reaches the end of the function.
// NOTE(review): ZE_CALL is defined outside this view; presumably it returns
// early with a mapped error code when zeMemFree fails - confirm.
pi_result USMFreeImpl(pi_context Context, void *Ptr) {
41124124
ZE_CALL(zeMemFree(Context->ZeContext, Ptr));
41134125
return PI_SUCCESS;
41144126
}
41154127

4128+
// Exception type used to carry a pi_result error code out of the USM
// allocation/deallocation routines. It wraps a single immutable error value
// and does not derive from any other class.
4129+
class UsmAllocationException {
4130+
// Error code captured at construction time.
const pi_result Error;
4131+
4132+
public:
4133+
// Stores Err so that it can later be retrieved with getError().
UsmAllocationException(pi_result Err) : Error{Err} {}
4134+
// Returns the error code this exception was constructed with.
pi_result getError() const { return Error; }
4135+
};
4136+
4137+
// Shared-memory flavour of the internal allocation hook: forwards to
// USMSharedAllocImpl using the Context and Device members of this object.
//
// ResultPtr - forwarded unchanged to USMSharedAllocImpl as the output pointer.
// Size      - forwarded unchanged as the requested size.
// Alignment - forwarded unchanged as the requested alignment.
//
// The properties argument is always passed as nullptr.
// Returns whatever USMSharedAllocImpl returns.
pi_result USMSharedMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
4138+
pi_uint32 Alignment) {
4139+
return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, Size,
4140+
Alignment);
4141+
}
4142+
4143+
// Device-memory flavour of the internal allocation hook: forwards to
// USMDeviceAllocImpl using the Context and Device members of this object.
//
// ResultPtr - forwarded unchanged to USMDeviceAllocImpl as the output pointer.
// Size      - forwarded unchanged as the requested size.
// Alignment - forwarded unchanged as the requested alignment.
//
// The properties argument is always passed as nullptr.
// Returns whatever USMDeviceAllocImpl returns.
pi_result USMDeviceMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
4144+
pi_uint32 Alignment) {
4145+
return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size,
4146+
Alignment);
4147+
}
4148+
4149+
// Allocates Size bytes through the virtual allocateImpl hook, requesting an
// alignment of sizeof(void *).
//
// Returns the pointer produced by allocateImpl.
// Throws UsmAllocationException carrying the pi_result when allocateImpl
// reports anything other than PI_SUCCESS.
void *USMMemoryAllocBase::allocate(size_t Size) {
4150+
void *Ptr = nullptr;
4151+
4152+
auto Res = allocateImpl(&Ptr, Size, sizeof(void *));
4153+
// Errors are reported by exception because this function returns the
// pointer itself rather than a status code.
if (Res != PI_SUCCESS) {
4154+
throw UsmAllocationException(Res);
4155+
}
4156+
4157+
return Ptr;
4158+
}
4159+
4160+
// Allocates Size bytes with the caller-supplied Alignment through the virtual
// allocateImpl hook.
//
// Returns the pointer produced by allocateImpl.
// Throws UsmAllocationException carrying the pi_result when allocateImpl
// reports anything other than PI_SUCCESS.
//
// NOTE(review): Alignment is a size_t here, while allocateImpl is declared
// with a pi_uint32 Alignment parameter, so the value is implicitly converted
// at the call below. If pi_uint32 is narrower than size_t, large values
// would be truncated - confirm.
void *USMMemoryAllocBase::allocate(size_t Size, size_t Alignment) {
4161+
void *Ptr = nullptr;
4162+
4163+
auto Res = allocateImpl(&Ptr, Size, Alignment);
4164+
if (Res != PI_SUCCESS) {
4165+
throw UsmAllocationException(Res);
4166+
}
4167+
return Ptr;
4168+
}
4169+
4170+
// Releases Ptr by calling USMFreeImpl with the Context member of this object.
//
// Throws UsmAllocationException carrying the pi_result when USMFreeImpl
// reports anything other than PI_SUCCESS.
// NOTE(review): because this function can throw, callers that invoke it from
// a destructor or other cleanup path need to handle the exception - the
// call sites are outside this view.
void USMMemoryAllocBase::deallocate(void *Ptr) {
4171+
auto Res = USMFreeImpl(Context, Ptr);
4172+
if (Res != PI_SUCCESS) {
4173+
throw UsmAllocationException(Res);
4174+
}
4175+
}
4176+
4177+
// Allocates USM device memory, either directly through USMDeviceAllocImpl or
// through the per-device allocator context stored in
// Context->DeviceMemAllocContexts.
//
// ResultPtr  - receives the allocated pointer on success; set to nullptr when
//              the allocator path throws UsmAllocationException.
// Context    - PI context; its DeviceMemAllocContexts map is searched.
// Device     - key used to look up the allocator context.
// Properties - only forwarded on the direct path; the allocator path does not
//              use it.
// Size       - requested size.
// Alignment  - requested alignment.
//
// Returns:
//   - the result of USMDeviceAllocImpl when the allocator is disabled or
//     Alignment has more than one bit set;
//   - PI_INVALID_VALUE when no allocator context is registered for Device
//     (in that case *ResultPtr is not written);
//   - the error carried by UsmAllocationException when allocate() throws;
//   - PI_SUCCESS otherwise.
//
// NOTE(review): only UsmAllocationException is caught; any other exception
// type thrown by allocate() would propagate to the caller.
pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
4178+
pi_device Device,
4179+
pi_usm_mem_properties *Properties, size_t Size,
4180+
pi_uint32 Alignment) {
4181+
if (!UseUSMAllocator ||
4182+
// L0 spec says that allocation fails if Alignment != 2^n, in order to
4183+
// keep the same behavior for the allocator, just call L0 API directly and
4184+
// return the error code.
4185+
// Note: Alignment == 0 also yields 0 in the bit test below, so it takes
// the allocator path rather than the direct path.
((Alignment & (Alignment - 1)) != 0)) {
4186+
return USMDeviceAllocImpl(ResultPtr, Context, Device, Properties, Size,
4187+
Alignment);
4188+
}
4189+
4190+
try {
4191+
// Look up the allocator context registered for this device.
auto It = Context->DeviceMemAllocContexts.find(Device);
4192+
if (It == Context->DeviceMemAllocContexts.end())
4193+
return PI_INVALID_VALUE;
4194+
4195+
*ResultPtr = It->second.allocate(Size, Alignment);
4196+
} catch (const UsmAllocationException &Ex) {
4197+
// Translate the exception back into a PI error code for the caller.
*ResultPtr = nullptr;
4198+
return Ex.getError();
4199+
}
4200+
4201+
return PI_SUCCESS;
4202+
}
4203+
4204+
// Allocates USM shared memory, either directly through USMSharedAllocImpl or
// through the per-device allocator context stored in
// Context->SharedMemAllocContexts.
//
// ResultPtr  - receives the allocated pointer on success; set to nullptr when
//              the allocator path throws UsmAllocationException.
// Context    - PI context; its SharedMemAllocContexts map is searched.
// Device     - key used to look up the allocator context.
// Properties - only forwarded on the direct path; the allocator path does not
//              use it.
// Size       - requested size.
// Alignment  - requested alignment.
//
// Returns:
//   - the result of USMSharedAllocImpl when the allocator is disabled or
//     Alignment has more than one bit set;
//   - PI_INVALID_VALUE when no allocator context is registered for Device
//     (in that case *ResultPtr is not written);
//   - the error carried by UsmAllocationException when allocate() throws;
//   - PI_SUCCESS otherwise.
//
// NOTE(review): only UsmAllocationException is caught; any other exception
// type thrown by allocate() would propagate to the caller.
pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
4205+
pi_device Device,
4206+
pi_usm_mem_properties *Properties, size_t Size,
4207+
pi_uint32 Alignment) {
4208+
if (!UseUSMAllocator ||
4209+
// L0 spec says that allocation fails if Alignment != 2^n, in order to
4210+
// keep the same behavior for the allocator, just call L0 API directly and
4211+
// return the error code.
4212+
// Note: Alignment == 0 also yields 0 in the bit test below, so it takes
// the allocator path rather than the direct path.
((Alignment & (Alignment - 1)) != 0)) {
4213+
return USMSharedAllocImpl(ResultPtr, Context, Device, Properties, Size,
4214+
Alignment);
4215+
}
4216+
4217+
try {
4218+
// Look up the allocator context registered for this device.
auto It = Context->SharedMemAllocContexts.find(Device);
4219+
if (It == Context->SharedMemAllocContexts.end())
4220+
return PI_INVALID_VALUE;
4221+
4222+
*ResultPtr = It->second.allocate(Size, Alignment);
4223+
} catch (const UsmAllocationException &Ex) {
4224+
// Translate the exception back into a PI error code for the caller.
*ResultPtr = nullptr;
4225+
return Ex.getError();
4226+
}
4227+
4228+
return PI_SUCCESS;
4229+
}
4230+
4231+
// Frees a USM pointer, routing it either to the USM allocator or directly to
// USMFreeImpl.
//
// Context - PI context that owns the allocator contexts and the L0 context.
// Ptr     - pointer to free.
//
// Routing:
//   1. Allocator disabled                          -> USMFreeImpl.
//   2. Device reported by zeMemGetAllocProperties
//      differs from Context->Device->ZeDevice      -> USMFreeImpl.
//   3. Memory type ZE_MEMORY_TYPE_SHARED           -> SharedMemAllocContexts.
//   4. Memory type ZE_MEMORY_TYPE_DEVICE           -> DeviceMemAllocContexts.
//   5. Any other memory type                       -> USMFreeImpl.
//
// Returns PI_INVALID_VALUE when the selected map has no entry for
// Context->Device, the error carried by UsmAllocationException when
// deallocate() throws, or otherwise PI_SUCCESS / the result of USMFreeImpl.
pi_result piextUSMFree(pi_context Context, void *Ptr) {
4232+
if (!UseUSMAllocator) {
4233+
return USMFreeImpl(Context, Ptr);
4234+
}
4235+
4236+
// Query the device of the allocation to determine the right allocator context
4237+
// ZeDeviceHandle is left uninitialized here; it is expected to be written by
// the zeMemGetAllocProperties call below.
ze_device_handle_t ZeDeviceHandle;
4238+
ze_memory_allocation_properties_t ZeMemoryAllocationProperties = {};
4239+
4240+
// Query memory type of the pointer we're freeing to determine the correct
4241+
// way to do it (directly or via the allocator)
4242+
ZE_CALL(zeMemGetAllocProperties(
4243+
Context->ZeContext, Ptr, &ZeMemoryAllocationProperties, &ZeDeviceHandle));
4244+
4245+
// TODO: when support for multiple devices is implemented, here
4246+
// we should do the following:
4247+
// - Find the pi_device instance corresponding to the ZeDeviceHandle we've
4248+
// just got, if it exists
4249+
// - Use that pi_device to find the right allocator context and free the
4250+
// pointer.
4251+
4252+
// The allocation doesn't belong to any device for which USM allocator is
4253+
// enabled.
4254+
if (Context->Device->ZeDevice != ZeDeviceHandle) {
4255+
return USMFreeImpl(Context, Ptr);
4256+
}
4257+
4258+
// Helper that frees Ptr through the allocator context registered for
// Context->Device in the given map, translating UsmAllocationException into
// a pi_result.
auto DeallocationHelper =
4259+
[Context,
4260+
Ptr](std::unordered_map<pi_device, USMAllocContext> &AllocContextMap) {
4261+
try {
4262+
auto It = AllocContextMap.find(Context->Device);
4263+
if (It == AllocContextMap.end())
4264+
return PI_INVALID_VALUE;
4265+
4266+
// The right context is found, deallocate the pointer
4267+
It->second.deallocate(Ptr);
4268+
} catch (const UsmAllocationException &Ex) {
4269+
return Ex.getError();
4270+
}
4271+
4272+
return PI_SUCCESS;
4273+
};
4274+
4275+
// Pick the allocator context map that matches the memory type of Ptr.
switch (ZeMemoryAllocationProperties.type) {
4276+
case ZE_MEMORY_TYPE_SHARED:
4277+
return DeallocationHelper(Context->SharedMemAllocContexts);
4278+
case ZE_MEMORY_TYPE_DEVICE:
4279+
return DeallocationHelper(Context->DeviceMemAllocContexts);
4280+
default:
4281+
// Handled below
4282+
break;
4283+
}
4284+
// Any other memory type is not managed by the allocator; free it directly.
return USMFreeImpl(Context, Ptr);
4285+
}
4286+
41164287
pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex,
41174288
size_t ArgSize, const void *ArgValue) {
41184289

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232

3333
#include <level_zero/ze_api.h>
3434

35+
#include "usm_allocator.hpp"
36+
3537
template <class To, class From> To pi_cast(From Value) {
3638
// TODO: see if more sanity checks are possible.
3739
assert(sizeof(From) == sizeof(To));
@@ -89,6 +91,46 @@ struct _pi_platform {
8991
std::atomic<int> ZeGlobalCommandListCount{0};
9092
};
9193

94+
// Implements the SystemMemory interface used by the USM allocator on top of
// the L0 runtime. Derived classes supply allocateImpl() for a concrete memory
// type; allocate()/deallocate() are final and shared by all of them.
95+
class USMMemoryAllocBase : public SystemMemory {
96+
protected:
97+
// PI context and device this allocation source is bound to. Both are stored
// as given to the constructor.
pi_context Context;
98+
pi_device Device;
99+
// Internal allocation routine which must be implemented for each allocation
100+
// type. Writes the allocated pointer to *ResultPtr and reports the outcome
101+
// as a pi_result.
virtual pi_result allocateImpl(void **ResultPtr, size_t Size,
102+
pi_uint32 Alignment) = 0;
103+
104+
public:
105+
// Binds the object to the given context/device pair.
USMMemoryAllocBase(pi_context Ctx, pi_device Dev)
106+
: Context{Ctx}, Device{Dev} {}
107+
// Overrides of the SystemMemory interface; declared final so that derived
// classes customize only allocateImpl().
void *allocate(size_t Size) override final;
108+
void *allocate(size_t Size, size_t Alignment) override final;
109+
void deallocate(void *Ptr) override final;
110+
};
111+
112+
// Allocation routines for the shared memory type: a USMMemoryAllocBase whose
// allocateImpl() override provides shared-memory allocation.
113+
class USMSharedMemoryAlloc : public USMMemoryAllocBase {
114+
protected:
115+
// Shared-memory implementation of the allocation hook declared in the base.
pi_result allocateImpl(void **ResultPtr, size_t Size,
116+
pi_uint32 Alignment) override;
117+
118+
public:
119+
// Forwards the context/device pair to the base class.
USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev)
120+
: USMMemoryAllocBase(Ctx, Dev) {}
121+
};
122+
123+
// Allocation routines for the device memory type: a USMMemoryAllocBase whose
// allocateImpl() override provides device-memory allocation.
124+
class USMDeviceMemoryAlloc : public USMMemoryAllocBase {
125+
protected:
126+
// Device-memory implementation of the allocation hook declared in the base.
pi_result allocateImpl(void **ResultPtr, size_t Size,
127+
pi_uint32 Alignment) override;
128+
129+
public:
130+
// Forwards the context/device pair to the base class.
USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev)
131+
: USMMemoryAllocBase(Ctx, Dev) {}
132+
};
133+
92134
struct _pi_device : _pi_object {
93135
_pi_device(ze_device_handle_t Device, pi_platform Plt,
94136
bool isSubDevice = false)
@@ -145,7 +187,19 @@ struct _pi_device : _pi_object {
145187
struct _pi_context : _pi_object {
146188
_pi_context(pi_device Device)
147189
: Device{Device}, ZeCommandListInit{nullptr}, ZeEventPool{nullptr},
148-
NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} {}
190+
NumEventsAvailableInEventPool{}, NumEventsLiveInEventPool{} {
191+
// TODO: when support for multiple devices is added, here we should
192+
// loop over all the devices and initialize allocator context for each
193+
// pair (context, device)
194+
SharedMemAllocContexts.emplace(
195+
std::piecewise_construct, std::make_tuple(Device),
196+
std::make_tuple(std::unique_ptr<SystemMemory>(
197+
new USMSharedMemoryAlloc(this, Device))));
198+
DeviceMemAllocContexts.emplace(
199+
std::piecewise_construct, std::make_tuple(Device),
200+
std::make_tuple(std::unique_ptr<SystemMemory>(
201+
new USMDeviceMemoryAlloc(this, Device))));
202+
}
149203

150204
// A L0 context handle is primarily used during creation and management of
151205
// resources that may be used by multiple devices.
@@ -174,6 +228,12 @@ struct _pi_context : _pi_object {
174228
// and destroy the pool if there are no alive events.
175229
ze_result_t decrementAliveEventsInPool(ze_event_pool_handle_t pool);
176230

231+
// Store USM allocator context(internal allocator structures)
232+
// for USM shared/host and device allocations. There is 1 allocator context
233+
// per each pair of (context, device) per each memory type.
234+
std::unordered_map<pi_device, USMAllocContext> SharedMemAllocContexts;
235+
std::unordered_map<pi_device, USMAllocContext> DeviceMemAllocContexts;
236+
177237
private:
178238
// Following member variables are used to manage assignment of events
179239
// to event pools.

0 commit comments

Comments
 (0)