-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[Offload] Remove non-blocking allocation type #159851
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-offload @llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) ChangesSummary: A true solution would be to use CUDA's virtual memory API instead to Full diff: https://github.com/llvm/llvm-project/pull/159851.diff 7 Files Affected:
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 8fd722bb15022..794b79e07674e 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -101,8 +101,6 @@ enum TargetAllocTy : int32_t {
TARGET_ALLOC_HOST,
TARGET_ALLOC_SHARED,
TARGET_ALLOC_DEFAULT,
- /// The allocation will not block on other streams.
- TARGET_ALLOC_DEVICE_NON_BLOCKING,
};
inline KernelArgsTy CTorDTorKernelArgs = {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 277fad29fd24f..64470e9fabf46 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2350,7 +2350,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
@@ -3847,7 +3846,6 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemoryPool = CoarseGrainedMemoryPools[0];
break;
case TARGET_ALLOC_HOST:
diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
index 2ad0f2b7dd6c6..68d82cbea0f3c 100644
--- a/offload/plugins-nextgen/common/include/ErrorReporting.h
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -61,7 +61,6 @@ class ErrorReporter {
/// Return a nice name for an TargetAllocTy.
static StringRef getAllocTyName(TargetAllocTy Kind) {
switch (Kind) {
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
case TARGET_ALLOC_DEFAULT:
case TARGET_ALLOC_DEVICE:
return "device memory";
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index b1955b53b80e5..30b5db782370d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1295,7 +1295,6 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
case TARGET_ALLOC_DEVICE:
if (MemoryManager) {
Alloc = MemoryManager->allocate(Size, HostPtr);
@@ -1386,7 +1385,6 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
int Res;
switch (Kind) {
case TARGET_ALLOC_DEFAULT:
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
case TARGET_ALLOC_DEVICE:
if (MemoryManager) {
Res = MemoryManager->free(TgtPtr);
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 678be78b56af6..bd25a6f1017a8 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -29,14 +29,14 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
case LIBC_MALLOC: {
Port.recv_and_send([&](rpc::Buffer *Buffer, uint32_t) {
Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
- Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
+ Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
});
break;
}
case LIBC_FREE: {
Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
Device.free(reinterpret_cast<void *>(Buffer->data[0]),
- TARGET_ALLOC_DEVICE_NON_BLOCKING);
+ TARGET_ALLOC_DEVICE);
});
break;
}
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index fdd470fcd8113..b2f840113cff3 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -587,16 +587,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
MemAlloc = (void *)DevicePtr;
break;
- case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
- CUstream Stream;
- if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
- break;
- if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
- break;
- cuStreamSynchronize(Stream);
- Res = cuStreamDestroy(Stream);
- MemAlloc = (void *)DevicePtr;
- }
}
if (auto Err =
@@ -627,15 +617,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_HOST:
Res = cuMemFreeHost(TgtPtr);
break;
- case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
- CUstream Stream;
- if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
- break;
- cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
- cuStreamSynchronize(Stream);
- if ((Res = cuStreamDestroy(Stream)))
- break;
- }
}
if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 0db01ca09ab02..44e2584fe53cc 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -250,7 +250,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
case TARGET_ALLOC_DEVICE:
case TARGET_ALLOC_HOST:
case TARGET_ALLOC_SHARED:
- case TARGET_ALLOC_DEVICE_NON_BLOCKING:
MemAlloc = std::malloc(Size);
break;
}
|
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Summary: This was originally added in as a hack to work around CUDA's limitation on allocation. The `libc` implementation now isn't even used for CUDA so this code is never hit. Even if this case, this code never truly worked. A true solution would be to use CUDA's virtual memory API instead to allocate 2MiB slabs independenctly from the normal memory management done in the stream.
5345b8d to
38446b0
Compare
jplehr
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Summary:
This was originally added in as a hack to work around CUDA's limitation
on allocation. The
libcimplementation now isn't even used for CUDA sothis code is never hit. Even if this case, this code never truly worked.
A true solution would be to use CUDA's virtual memory API instead to
allocate 2MiB slabs independenctly from the normal memory management
done in the stream.