Skip to content

[SYCL][UR][CUDA] Move CUDA device memory pools to the context #17411

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions unified-runtime/source/adapters/cuda/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
//===----------------------------------------------------------------------===//

#include "common.hpp"
#include "device.hpp"
#include "logger/ur_logger.hpp"
#include "umf_helpers.hpp"

#include <cuda.h>

Expand Down Expand Up @@ -129,3 +131,84 @@ void setPluginSpecificMessage(CUresult cu_res) {
setErrorMessage(message, UR_RESULT_ERROR_ADAPTER_SPECIFIC);
free(message);
}

namespace umf {

// Map a UMF-provider-native error code to a UR result.
// Both arguments (provider name, native error value) are currently ignored
// and every native error is reported as UR_RESULT_ERROR_UNKNOWN.
ur_result_t getProviderNativeError(const char *, int32_t) {
// TODO: implement when UMF supports CUDA
return UR_RESULT_ERROR_UNKNOWN;
}

// Create a UMF CUDA memory provider for host allocations
// (UMF_MEMORY_TYPE_HOST) bound to the given CUDA context.
//
// On success *memoryProviderHost holds the new provider, which the caller is
// responsible for destroying; on failure it is left equal to nullptr.
ur_result_t
createHostMemoryProvider(CUcontext contextCUDA,
                         umf_memory_provider_handle_t *memoryProviderHost) {
  *memoryProviderHost = nullptr;

  // The provider-params handle only lives for the duration of this call.
  umf_cuda_memory_provider_params_handle_t ParamsHandle = nullptr;
  umf_result_t Result = umfCUDAMemoryProviderParamsCreate(&ParamsHandle);
  UMF_RETURN_UR_ERROR(Result);

  OnScopeExit DestroyParams(
      [=]() { umfCUDAMemoryProviderParamsDestroy(ParamsHandle); });

  // cuDevice is fixed to 0 for the host-memory provider.
  Result = umf::setCUMemoryProviderParams(ParamsHandle, 0 /* cuDevice */,
                                          contextCUDA, UMF_MEMORY_TYPE_HOST);
  UMF_RETURN_UR_ERROR(Result);

  // Create the UMF CUDA memory provider for host memory
  // (UMF_MEMORY_TYPE_HOST).
  Result = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), ParamsHandle,
                                   memoryProviderHost);
  UMF_RETURN_UR_ERROR(Result);

  return UR_RESULT_SUCCESS;
}

// Create UMF CUDA memory providers for the device memory
// (UMF_MEMORY_TYPE_DEVICE) and the shared memory (UMF_MEMORY_TYPE_SHARED).
//
// On success both *memoryDeviceProvider and *memorySharedProvider hold new
// providers owned by the caller. On failure both are left equal to nullptr:
// a device provider created before a later step fails is destroyed again, so
// the caller never has to clean up a partial result.
ur_result_t createDeviceMemoryProviders(
    ur_device_handle_t_ *DeviceHandle,
    umf_memory_provider_handle_t *memoryDeviceProvider,
    umf_memory_provider_handle_t *memorySharedProvider) {
  *memoryDeviceProvider = nullptr;
  *memorySharedProvider = nullptr;

  umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr;

  umf_result_t UmfResult =
      umfCUDAMemoryProviderParamsCreate(&CUMemoryProviderParams);
  UMF_RETURN_UR_ERROR(UmfResult);

  // The params handle is reused for both providers and destroyed on exit.
  OnScopeExit Cleanup(
      [=]() { umfCUDAMemoryProviderParamsDestroy(CUMemoryProviderParams); });

  CUdevice device = DeviceHandle->get();
  CUcontext context = DeviceHandle->getNativeContext();

  // create UMF CUDA memory provider for the device memory
  // (UMF_MEMORY_TYPE_DEVICE)
  UmfResult = umf::setCUMemoryProviderParams(CUMemoryProviderParams, device,
                                             context, UMF_MEMORY_TYPE_DEVICE);
  UMF_RETURN_UR_ERROR(UmfResult);

  UmfResult = umfMemoryProviderCreate(
      umfCUDAMemoryProviderOps(), CUMemoryProviderParams, memoryDeviceProvider);
  UMF_RETURN_UR_ERROR(UmfResult);

  // create UMF CUDA memory provider for the shared memory
  // (UMF_MEMORY_TYPE_SHARED)
  UmfResult = umf::setCUMemoryProviderParams(CUMemoryProviderParams, device,
                                             context, UMF_MEMORY_TYPE_SHARED);
  if (UmfResult != UMF_RESULT_SUCCESS) {
    // Don't leak the already-created device provider on a partial failure.
    umfMemoryProviderDestroy(*memoryDeviceProvider);
    *memoryDeviceProvider = nullptr;
    UMF_RETURN_UR_ERROR(UmfResult);
  }

  UmfResult = umfMemoryProviderCreate(
      umfCUDAMemoryProviderOps(), CUMemoryProviderParams, memorySharedProvider);
  if (UmfResult != UMF_RESULT_SUCCESS) {
    umfMemoryProviderDestroy(*memoryDeviceProvider);
    *memoryDeviceProvider = nullptr;
    UMF_RETURN_UR_ERROR(UmfResult);
  }

  return UR_RESULT_SUCCESS;
}

} // namespace umf
11 changes: 11 additions & 0 deletions unified-runtime/source/adapters/cuda/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ void assertion(bool Condition, const char *Message = nullptr);

namespace umf {

ur_result_t getProviderNativeError(const char *, int32_t);

inline umf_result_t setCUMemoryProviderParams(
umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams,
int cuDevice, void *cuContext, umf_usm_memory_type_t memType) {
Expand All @@ -92,4 +94,13 @@ inline umf_result_t setCUMemoryProviderParams(
return UMF_RESULT_SUCCESS;
}

ur_result_t
createHostMemoryProvider(CUcontext contextCUDA,
umf_memory_provider_handle_t *memoryProviderHost);

ur_result_t
createDeviceMemoryProviders(ur_device_handle_t_ *DeviceHandle,
umf_memory_provider_handle_t *memoryDeviceProvider,
umf_memory_provider_handle_t *memorySharedProvider);

} // namespace umf
115 changes: 74 additions & 41 deletions unified-runtime/source/adapters/cuda/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,38 +76,6 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
///
///

// Create a UMF CUDA memory provider plus a proxy pool for host memory
// (UMF_MEMORY_TYPE_HOST), using the native context of the given device.
//
// On success *MemoryProviderHost and *MemoryPoolHost hold the new handles and
// the caller is responsible for destroying both.
static ur_result_t
CreateHostMemoryProviderPool(ur_device_handle_t_ *DeviceHandle,
                             umf_memory_provider_handle_t *MemoryProviderHost,
                             umf_memory_pool_handle_t *MemoryPoolHost) {

  *MemoryProviderHost = nullptr;
  CUcontext NativeContext = DeviceHandle->getNativeContext();

  // The provider-params handle only lives for the duration of this call.
  umf_cuda_memory_provider_params_handle_t ParamsHandle = nullptr;
  umf_result_t Result = umfCUDAMemoryProviderParamsCreate(&ParamsHandle);
  UMF_RETURN_UR_ERROR(Result);
  OnScopeExit DestroyParams(
      [=]() { umfCUDAMemoryProviderParamsDestroy(ParamsHandle); });

  // cuDevice is fixed to 0 for the host-memory provider.
  Result = umf::setCUMemoryProviderParams(
      ParamsHandle, 0 /* cuDevice */, NativeContext, UMF_MEMORY_TYPE_HOST);
  UMF_RETURN_UR_ERROR(Result);

  // create UMF CUDA memory provider and pool for the host memory
  // (UMF_MEMORY_TYPE_HOST)
  Result = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), ParamsHandle,
                                   MemoryProviderHost);
  UMF_RETURN_UR_ERROR(Result);

  Result = umfPoolCreate(umfProxyPoolOps(), *MemoryProviderHost, nullptr, 0,
                         MemoryPoolHost);
  UMF_RETURN_UR_ERROR(Result);

  return UR_RESULT_SUCCESS;
}

struct ur_context_handle_t_ {

struct deleter_data {
Expand All @@ -120,30 +88,42 @@ struct ur_context_handle_t_ {
std::vector<ur_device_handle_t> Devices;
std::atomic_uint32_t RefCount;

// UMF CUDA memory provider and pool for the host memory
// UMF CUDA memory pool for the host memory
// (UMF_MEMORY_TYPE_HOST)
umf_memory_provider_handle_t MemoryProviderHost = nullptr;
umf_memory_pool_handle_t MemoryPoolHost = nullptr;

// UMF CUDA memory pools for the device memory
// (UMF_MEMORY_TYPE_DEVICE)
std::vector<umf_memory_pool_handle_t> MemoryDevicePools;

// UMF CUDA memory pools for the shared memory
// (UMF_MEMORY_TYPE_SHARED)
std::vector<umf_memory_pool_handle_t> MemorySharedPools;

// Take a reference on every device in the context and create the
// per-context UMF memory pools (one host pool, plus device/shared pools
// created per device by createDeviceMemoryPools()).
ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
: Devices{Devs, Devs + NumDevices}, RefCount{1} {
for (auto &Dev : Devices) {
urDeviceRetain(Dev);
}

// Create UMF CUDA memory provider and pool for the host memory
// (UMF_MEMORY_TYPE_HOST)
UR_CHECK_ERROR(createHostMemoryPool());

// Create UMF CUDA memory providers and pools for the device memory
// (UMF_MEMORY_TYPE_DEVICE) and the shared memory (UMF_MEMORY_TYPE_SHARED).
UR_CHECK_ERROR(createDeviceMemoryPools());
};

~ur_context_handle_t_() {
if (MemoryPoolHost) {
umfPoolDestroy(MemoryPoolHost);
}
if (MemoryProviderHost) {
umfMemoryProviderDestroy(MemoryProviderHost);
for (auto &Pool : MemoryDevicePools) {
umfPoolDestroy(Pool);
}
for (auto &Pool : MemorySharedPools) {
umfPoolDestroy(Pool);
}
for (auto &Dev : Devices) {
urDeviceRelease(Dev);
Expand Down Expand Up @@ -190,6 +170,59 @@ struct ur_context_handle_t_ {
std::mutex Mutex;
std::vector<deleter_data> ExtendedDeleters;
std::set<ur_usm_pool_handle_t> PoolHandles;

// Create the per-context UMF CUDA proxy pool for host memory
// (UMF_MEMORY_TYPE_HOST) and store it in MemoryPoolHost.
// Devices[0] supplies the native context (a context always has at least one
// device).
ur_result_t createHostMemoryPool() {
  umf_memory_provider_handle_t HostProvider = nullptr;
  ur_result_t Res = umf::createHostMemoryProvider(
      Devices[0]->getNativeContext(), &HostProvider);
  if (Res != UR_RESULT_SUCCESS) {
    return Res;
  }

  // UMF_POOL_CREATE_FLAG_OWN_PROVIDER: on success the pool owns the
  // provider, so destroying MemoryPoolHost also destroys HostProvider.
  umf_result_t PoolRes =
      umfPoolCreate(umfProxyPoolOps(), HostProvider, nullptr,
                    UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &MemoryPoolHost);
  UMF_RETURN_UR_ERROR(PoolRes);

  return UR_RESULT_SUCCESS;
}

// Create UMF CUDA memory pools for the device memory (UMF_MEMORY_TYPE_DEVICE)
// and the shared memory (UMF_MEMORY_TYPE_SHARED) — one pool of each kind per
// device in this context, stored at the device's index in MemoryDevicePools
// and MemorySharedPools.
ur_result_t createDeviceMemoryPools() {
  for (auto &Device : Devices) {
    umf_memory_provider_handle_t memoryDeviceProvider = nullptr;
    umf_memory_provider_handle_t memorySharedProvider = nullptr;
    ur_result_t URResult = umf::createDeviceMemoryProviders(
        Device, &memoryDeviceProvider, &memorySharedProvider);
    if (URResult != UR_RESULT_SUCCESS) {
      return URResult;
    }

    // create UMF CUDA memory pool for the device memory
    // (UMF_MEMORY_TYPE_DEVICE); on success the pool owns its provider
    umf_result_t UmfResult;
    umf_memory_pool_handle_t memoryDevicePool = nullptr;
    UmfResult =
        umfPoolCreate(umfProxyPoolOps(), memoryDeviceProvider, nullptr,
                      UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &memoryDevicePool);
    if (UmfResult != UMF_RESULT_SUCCESS) {
      // The shared provider has no owning pool yet — destroy it here so it
      // is not leaked on this early return.
      umfMemoryProviderDestroy(memorySharedProvider);
      UMF_RETURN_UR_ERROR(UmfResult);
    }

    // create UMF CUDA memory pool for the shared memory
    // (UMF_MEMORY_TYPE_SHARED)
    umf_memory_pool_handle_t memorySharedPool = nullptr;
    UmfResult =
        umfPoolCreate(umfProxyPoolOps(), memorySharedProvider, nullptr,
                      UMF_POOL_CREATE_FLAG_OWN_PROVIDER, &memorySharedPool);
    if (UmfResult != UMF_RESULT_SUCCESS) {
      // The device pool is not in MemoryDevicePools yet, so the destructor
      // would never free it; destroying it also destroys its owned provider.
      umfPoolDestroy(memoryDevicePool);
      UMF_RETURN_UR_ERROR(UmfResult);
    }

    MemoryDevicePools.push_back(memoryDevicePool);
    MemorySharedPools.push_back(memorySharedPool);
  }

  return UR_RESULT_SUCCESS;
}
};

namespace {
Expand Down
31 changes: 1 addition & 30 deletions unified-runtime/source/adapters/cuda/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,28 +82,9 @@ struct ur_device_handle_t_ {
// CUDA doesn't really have this concept, and could allow almost 100% of
// global memory in one allocation, but is dependent on device usage.
UR_CHECK_ERROR(cuDeviceTotalMem(&MaxAllocSize, cuDevice));

MemoryProviderDevice = nullptr;
MemoryProviderShared = nullptr;
MemoryPoolDevice = nullptr;
MemoryPoolShared = nullptr;
}

~ur_device_handle_t_() {
if (MemoryPoolDevice) {
umfPoolDestroy(MemoryPoolDevice);
}
if (MemoryPoolShared) {
umfPoolDestroy(MemoryPoolShared);
}
if (MemoryProviderDevice) {
umfMemoryProviderDestroy(MemoryProviderDevice);
}
if (MemoryProviderShared) {
umfMemoryProviderDestroy(MemoryProviderShared);
}
cuDevicePrimaryCtxRelease(CuDevice);
}
// Drop the primary-context reference held for this CUDA device
// (presumably retained in the constructor, which is outside this view).
~ur_device_handle_t_() { cuDevicePrimaryCtxRelease(CuDevice); }

native_type get() const noexcept { return CuDevice; };

Expand Down Expand Up @@ -139,16 +120,6 @@ struct ur_device_handle_t_ {

// bookkeeping for mipmappedArray leaks in Mapping external Memory
std::map<CUarray, CUmipmappedArray> ChildCuarrayFromMipmapMap;

// UMF CUDA memory provider and pool for the device memory
// (UMF_MEMORY_TYPE_DEVICE)
umf_memory_provider_handle_t MemoryProviderDevice;
umf_memory_pool_handle_t MemoryPoolDevice;

// UMF CUDA memory provider and pool for the shared memory
// (UMF_MEMORY_TYPE_SHARED)
umf_memory_provider_handle_t MemoryProviderShared;
umf_memory_pool_handle_t MemoryPoolShared;
};

int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute);
6 changes: 4 additions & 2 deletions unified-runtime/source/adapters/cuda/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,7 +422,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
const ur_device_handle_t hDevice) {
ScopedContext Active(hDevice);
auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice);
ur_context_handle_t Context = Mem->getContext();
auto DeviceIdx = Context->getDeviceIndex(hDevice);
ur_lock LockGuard(Mem->MemoryAllocationMutex);

if (Mem->isBuffer()) {
Expand All @@ -442,7 +443,8 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
CU_MEMHOSTALLOC_DEVICEMAP));
UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
} else {
*(void **)&DevPtr = umfPoolMalloc(hDevice->MemoryPoolDevice, Buffer.Size);
*(void **)&DevPtr =
umfPoolMalloc(Context->MemoryDevicePools[DeviceIdx], Buffer.Size);
UMF_CHECK_PTR(*(void **)&DevPtr);
}
} else {
Expand Down
Loading