Conversation


@jhuber6 jhuber6 commented Sep 19, 2025

Summary:
This was originally added as a hack to work around CUDA's limitation
on allocation. The libc implementation is no longer used for CUDA, so
this code is never hit. Even when it was, this code never truly worked.

A true solution would be to use CUDA's virtual memory API instead to
allocate 2MiB slabs independently from the normal memory management
done in the stream.
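
A minimal sketch of that virtual memory approach, assuming a 2MiB minimum
granularity and eliding cleanup on the error paths (the `allocSlab` helper
below is hypothetical, not part of this patch):

```cpp
#include <cuda.h>

// Hypothetical helper: map one slab of device memory at a reserved virtual
// address, outside of any stream-ordered allocator.
static void *allocSlab(CUdevice Dev, size_t Size) {
  CUmemAllocationProp Prop = {};
  Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  Prop.location.id = Dev;

  // Round the request up to the allocation granularity (typically 2MiB).
  size_t Granularity = 0;
  if (cuMemGetAllocationGranularity(&Granularity, &Prop,
                                    CU_MEM_ALLOC_GRANULARITY_MINIMUM))
    return nullptr;
  Size = (Size + Granularity - 1) & ~(Granularity - 1);

  // Reserve a virtual address range, create the physical backing, map it.
  CUdeviceptr Ptr;
  CUmemGenericAllocationHandle Handle;
  if (cuMemAddressReserve(&Ptr, Size, /*alignment=*/0, /*addr=*/0, 0) ||
      cuMemCreate(&Handle, Size, &Prop, 0) ||
      cuMemMap(Ptr, Size, /*offset=*/0, Handle, 0))
    return nullptr;

  // Grant the device read/write access to the new mapping.
  CUmemAccessDesc Access = {};
  Access.location = Prop.location;
  Access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  if (cuMemSetAccess(Ptr, Size, &Access, 1))
    return nullptr;
  return reinterpret_cast<void *>(Ptr);
}
```

Freeing would reverse the steps with `cuMemUnmap`, `cuMemRelease`, and
`cuMemAddressFree`, again without touching any stream.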


llvmbot commented Sep 19, 2025

@llvm/pr-subscribers-offload

@llvm/pr-subscribers-backend-amdgpu

Author: Joseph Huber (jhuber6)

Changes

Summary:
This was originally added as a hack to work around CUDA's limitation
on allocation. The libc implementation is no longer used for CUDA, so
this code is never hit. Even when it was, this code never truly worked.

A true solution would be to use CUDA's virtual memory API instead to
allocate 2MiB slabs independently from the normal memory management
done in the stream.


Full diff: https://github.com/llvm/llvm-project/pull/159851.diff

7 Files Affected:

  • (modified) offload/include/omptarget.h (-2)
  • (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (-2)
  • (modified) offload/plugins-nextgen/common/include/ErrorReporting.h (-1)
  • (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (-2)
  • (modified) offload/plugins-nextgen/common/src/RPC.cpp (+2-2)
  • (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (-19)
  • (modified) offload/plugins-nextgen/host/src/rtl.cpp (-1)
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 8fd722bb15022..794b79e07674e 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -101,8 +101,6 @@ enum TargetAllocTy : int32_t {
   TARGET_ALLOC_HOST,
   TARGET_ALLOC_SHARED,
   TARGET_ALLOC_DEFAULT,
-  /// The allocation will not block on other streams.
-  TARGET_ALLOC_DEVICE_NON_BLOCKING,
 };
 
 inline KernelArgsTy CTorDTorKernelArgs = {
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 277fad29fd24f..64470e9fabf46 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2350,7 +2350,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     switch (Kind) {
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemoryPool = CoarseGrainedMemoryPools[0];
       break;
     case TARGET_ALLOC_HOST:
@@ -3847,7 +3846,6 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
   case TARGET_ALLOC_DEVICE:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     MemoryPool = CoarseGrainedMemoryPools[0];
     break;
   case TARGET_ALLOC_HOST:
diff --git a/offload/plugins-nextgen/common/include/ErrorReporting.h b/offload/plugins-nextgen/common/include/ErrorReporting.h
index 2ad0f2b7dd6c6..68d82cbea0f3c 100644
--- a/offload/plugins-nextgen/common/include/ErrorReporting.h
+++ b/offload/plugins-nextgen/common/include/ErrorReporting.h
@@ -61,7 +61,6 @@ class ErrorReporter {
   /// Return a nice name for an TargetAllocTy.
   static StringRef getAllocTyName(TargetAllocTy Kind) {
     switch (Kind) {
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
     case TARGET_ALLOC_DEFAULT:
     case TARGET_ALLOC_DEVICE:
       return "device memory";
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index b1955b53b80e5..30b5db782370d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1295,7 +1295,6 @@ Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
 
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
   case TARGET_ALLOC_DEVICE:
     if (MemoryManager) {
       Alloc = MemoryManager->allocate(Size, HostPtr);
@@ -1386,7 +1385,6 @@ Error GenericDeviceTy::dataDelete(void *TgtPtr, TargetAllocTy Kind) {
   int Res;
   switch (Kind) {
   case TARGET_ALLOC_DEFAULT:
-  case TARGET_ALLOC_DEVICE_NON_BLOCKING:
   case TARGET_ALLOC_DEVICE:
     if (MemoryManager) {
       Res = MemoryManager->free(TgtPtr);
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 678be78b56af6..bd25a6f1017a8 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -29,14 +29,14 @@ rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
   case LIBC_MALLOC: {
     Port.recv_and_send([&](rpc::Buffer *Buffer, uint32_t) {
       Buffer->data[0] = reinterpret_cast<uintptr_t>(Device.allocate(
-          Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE_NON_BLOCKING));
+          Buffer->data[0], nullptr, TARGET_ALLOC_DEVICE));
     });
     break;
   }
   case LIBC_FREE: {
     Port.recv([&](rpc::Buffer *Buffer, uint32_t) {
       Device.free(reinterpret_cast<void *>(Buffer->data[0]),
-                  TARGET_ALLOC_DEVICE_NON_BLOCKING);
+                  TARGET_ALLOC_DEVICE);
     });
     break;
   }
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index fdd470fcd8113..b2f840113cff3 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -587,16 +587,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
       Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
       MemAlloc = (void *)DevicePtr;
       break;
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
-      CUstream Stream;
-      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
-        break;
-      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
-        break;
-      cuStreamSynchronize(Stream);
-      Res = cuStreamDestroy(Stream);
-      MemAlloc = (void *)DevicePtr;
-    }
     }
 
     if (auto Err =
@@ -627,15 +617,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
     case TARGET_ALLOC_HOST:
       Res = cuMemFreeHost(TgtPtr);
       break;
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
-      CUstream Stream;
-      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
-        break;
-      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
-      cuStreamSynchronize(Stream);
-      if ((Res = cuStreamDestroy(Stream)))
-        break;
-    }
     }
 
     if (auto Err = Plugin::check(Res, "error in cuMemFree[Host]: %s")) {
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 0db01ca09ab02..44e2584fe53cc 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -250,7 +250,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     case TARGET_ALLOC_DEVICE:
     case TARGET_ALLOC_HOST:
     case TARGET_ALLOC_SHARED:
-    case TARGET_ALLOC_DEVICE_NON_BLOCKING:
       MemAlloc = std::malloc(Size);
       break;
     }


github-actions bot commented Sep 19, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@llvm llvm deleted a comment from shupach Sep 20, 2025

@jplehr jplehr left a comment


LGTM

@jhuber6 jhuber6 merged commit 23efc67 into llvm:main Sep 20, 2025
9 checks passed
@jhuber6 jhuber6 deleted the RemoveNonBlocking branch September 20, 2025 14:07