AMDGPU: Implement bitcode autoupgrade for old style enqueue blocks #128520


Closed

Conversation

arsenm
Contributor

@arsenm arsenm commented Feb 24, 2025

This introduces a circular dependency in the build for appendToUsed, and
I'm not sure it's worth the trouble to fix. We can most likely get away
without upgrading this. Alternatively, we could move appendToUsed /
appendToCompilerUsed directly into Module.
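
The circular dependency comes from appendToUsed living in
llvm/Transforms/Utils/ModuleUtils.h (TransformUtils), which itself links
against LLVMCore, while AutoUpgrade.cpp is part of LLVMCore. As a rough
illustration of the second option, here is a minimal sketch of an
appendToUsed-style helper written only against Core; the name, placement,
and exact behavior are illustrative and not part of this patch:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

// Minimal sketch: rebuild the appending @llvm.used array with the extra
// values added, using only LLVMCore facilities.
static void appendToUsedList(llvm::Module &M,
                             llvm::ArrayRef<llvm::GlobalValue *> Values) {
  using namespace llvm;
  PointerType *PtrTy = PointerType::getUnqual(M.getContext());
  SmallVector<Constant *, 16> Entries;

  // Collect any existing entries, then drop the old global so it can be
  // recreated with the larger array type.
  if (GlobalVariable *Old = M.getGlobalVariable("llvm.used")) {
    if (Old->hasInitializer())
      if (auto *CA = dyn_cast<ConstantArray>(Old->getInitializer()))
        for (Value *Op : CA->operands())
          Entries.push_back(cast<Constant>(Op));
    Old->eraseFromParent();
  }

  for (GlobalValue *GV : Values)
    Entries.push_back(
        ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, PtrTy));

  ArrayType *ATy = ArrayType::get(PtrTy, Entries.size());
  auto *Used = new GlobalVariable(M, ATy, /*isConstant=*/false,
                                  GlobalValue::AppendingLinkage,
                                  ConstantArray::get(ATy, Entries),
                                  "llvm.used");
  Used->setSection("llvm.metadata");
}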


@arsenm arsenm added backend:AMDGPU, clang:codegen, OpenCL labels Feb 24, 2025 — with Graphite App
@arsenm arsenm marked this pull request as ready for review February 24, 2025 14:26
@llvmbot
Member

llvmbot commented Feb 24, 2025

@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-clang-codegen

Author: Matt Arsenault (arsenm)

Changes

This introduces a circular dependency in the build for appendToUsed, and
I'm not sure it's worth the trouble to fix. We can most likely get away
without upgrading this. Alternatively, we could move appendToUsed /
appendToCompilerUsed directly into Module.


Full diff: https://github.com/llvm/llvm-project/pull/128520.diff

3 Files Affected:

  • (modified) llvm/lib/IR/AutoUpgrade.cpp (+50)
  • (modified) llvm/lib/IR/CMakeLists.txt (+1)
  • (added) llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll (+138)
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 57072715366c9..4f8ccb7eac96d 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -46,6 +46,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
 #include <cstring>
 #include <numeric>
 
@@ -5466,6 +5468,51 @@ struct AMDGPUUnsafeFPAtomicsUpgradeVisitor
 };
 } // namespace
 
+static StructType *getAMDGPURuntimeHandleType(LLVMContext &C,
+                                              Type *KernelDescriptorPtrTy) {
+  Type *Int32 = Type::getInt32Ty(C);
+  return StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
+                            "block.runtime.handle.t");
+}
+
+/// Rewrite to new scheme for enqueued block lowering
+static void upgradeAMDGPUKernelEnqueuedBlock(Function &F) {
+  if (F.isMaterializable()) {
+    // A verifier error is produced if we add metadata to the function during
+    // linking.
+    return;
+  }
+
+  const StringLiteral EnqueuedBlockName("enqueued-block");
+  if (!F.hasFnAttribute(EnqueuedBlockName))
+    return;
+
+  F.removeFnAttr(EnqueuedBlockName);
+
+  Module *M = F.getParent();
+  LLVMContext &Ctx = M->getContext();
+  const DataLayout &DL = M->getDataLayout();
+
+  StructType *HandleTy = getAMDGPURuntimeHandleType(
+      Ctx, PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace()));
+
+  Twine RuntimeHandleName = F.getName() + ".runtime.handle";
+
+  auto *RuntimeHandle = new GlobalVariable(
+      *M, HandleTy,
+      /*isConstant=*/true, F.getLinkage(),
+      /*Initializer=*/ConstantAggregateZero::get(HandleTy), RuntimeHandleName,
+      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+      DL.getDefaultGlobalsAddressSpace(),
+      /*isExternallyInitialized=*/true);
+  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
+
+  MDNode *HandleAsMD = MDNode::get(Ctx, ValueAsMetadata::get(RuntimeHandle));
+  F.setMetadata(LLVMContext::MD_associated, HandleAsMD);
+
+  appendToUsed(*M, {&F, RuntimeHandle});
+}
+
 void llvm::UpgradeFunctionAttributes(Function &F) {
   // If a function definition doesn't have the strictfp attribute,
   // convert any callsite strictfp attributes to nobuiltin.
@@ -5506,6 +5553,9 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
       F.removeFnAttr("amdgpu-unsafe-fp-atomics");
     }
   }
+
+  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
+    upgradeAMDGPUKernelEnqueuedBlock(F);
 }
 
 static bool isOldLoopArgument(Metadata *MD) {
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index eb00829fd8c70..a78c58c807f6a 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -92,6 +92,7 @@ add_llvm_component_library(LLVMCore
   LINK_COMPONENTS
   BinaryFormat
   Demangle
+  TransformUtils
   Remarks
   Support
   TargetParser
diff --git a/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll
new file mode 100644
index 0000000000000..649d95e77588b
--- /dev/null
+++ b/llvm/test/Bitcode/amdgpu-autoupgrade-enqueued-block.ll
@@ -0,0 +1,138 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+%struct.ndrange_t = type { i32 }
+%opencl.queue_t = type opaque
+
+; CHECK: %block.runtime.handle.t = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.0 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.1 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.2 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.3 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.4 = type { ptr, i32, i32 }
+
+
+; CHECK: @kernel_address_user = global [1 x ptr] [ptr @block_has_used_kernel_address]
+; CHECK: @__test_block_invoke_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @block_has_used_kernel_address.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.2 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @.runtime.handle.1 = internal externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @kernel_linkonce_odr_block.runtime.handle = linkonce_odr externally_initialized constant %block.runtime.handle.t.4 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @llvm.used = appending global [12 x ptr] [ptr @__test_block_invoke_kernel, ptr @__test_block_invoke_kernel.runtime.handle, ptr @__test_block_invoke_2_kernel, ptr @__test_block_invoke_2_kernel.runtime.handle, ptr @block_has_used_kernel_address, ptr @block_has_used_kernel_address.runtime.handle, ptr @0, ptr @.runtime.handle, ptr @1, ptr @.runtime.handle.1, ptr @kernel_linkonce_odr_block, ptr @kernel_linkonce_odr_block.runtime.handle], section "llvm.metadata"
+
+
+define amdgpu_kernel void @non_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
+  ret void
+}
+
+define amdgpu_kernel void @caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
+entry:
+  %block = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5)
+  %inst = alloca %struct.ndrange_t, align 4, addrspace(5)
+  %block2 = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+  %inst3 = alloca %struct.ndrange_t, align 4, addrspace(5)
+  %block.size = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 0
+  store i32 25, ptr addrspace(5) %block.size, align 8
+  %block.align = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 1
+  store i32 8, ptr addrspace(5) %block.align, align 4
+  %block.captured = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 2
+  store ptr addrspace(1) %a, ptr addrspace(5) %block.captured, align 8
+  %block.captured1 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 3
+  store i8 %b, ptr addrspace(5) %block.captured1, align 8
+  %inst4 = addrspacecast ptr addrspace(5) %block to ptr
+  %inst5 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+  ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
+  %inst10 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+  ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
+  %inst11 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+  ptr @0, ptr nonnull %inst4) #2
+  %inst12 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+  ptr @1, ptr nonnull %inst4) #2
+  %block.size4 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 0
+  store i32 41, ptr addrspace(5) %block.size4, align 8
+  %block.align5 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 1
+  store i32 8, ptr addrspace(5) %block.align5, align 4
+  %block.captured7 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 2
+  store ptr addrspace(1) %a, ptr addrspace(5) %block.captured7, align 8
+  %block.captured8 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 5
+  store i8 %b, ptr addrspace(5) %block.captured8, align 8
+  %block.captured9 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 3
+  store ptr addrspace(1) %c, ptr addrspace(5) %block.captured9, align 8
+  %block.captured10 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 4
+  store i64 %d, ptr addrspace(5) %block.captured10, align 8
+  %inst8 = addrspacecast ptr addrspace(5) %block2 to ptr
+  %inst9 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) undef, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst3,
+  ptr @__test_block_invoke_2_kernel, ptr nonnull %inst8) #2
+  ret void
+}
+
+; __enqueue_kernel* functions may get inlined
+define amdgpu_kernel void @inlined_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
+entry:
+  %inst = load i64, ptr addrspace(1) addrspacecast (ptr @__test_block_invoke_kernel to ptr addrspace(1))
+  store i64 %inst, ptr addrspace(1) %c
+  ret void
+}
+
+; CHECK: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !0 {
+define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
+  %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
+  store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
+  ret void
+}
+
+declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr
+
+; CHECK: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) !associated !1 {
+define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) #0 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 2
+  %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 3
+  %.fca.5.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 4
+  %.fca.6.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 5
+  store i8 %.fca.6.extract, ptr addrspace(1) %.fca.3.extract, align 1
+  store i64 %.fca.5.extract, ptr addrspace(1) %.fca.4.extract, align 8
+  ret void
+}
+
+@kernel_address_user = global [1 x ptr] [ ptr @block_has_used_kernel_address ]
+
+; CHECK: define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !2 {
+define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
+  %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
+  store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
+  ret void
+}
+
+define amdgpu_kernel void @user_of_kernel_address(ptr addrspace(1) %arg) {
+  store ptr @block_has_used_kernel_address, ptr addrspace(1) %arg
+  ret void
+}
+
+; CHECK: define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !3 {
+define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+  ret void
+}
+
+; CHECK: define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !4 {
+define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+  ret void
+}
+
+; CHECK: define linkonce_odr amdgpu_kernel void @kernel_linkonce_odr_block() !associated !5 {
+define linkonce_odr amdgpu_kernel void @kernel_linkonce_odr_block() #0 {
+  ret void
+}
+
+attributes #0 = { "enqueued-block" }
+
+; CHECK: !0 = !{ptr @__test_block_invoke_kernel.runtime.handle}
+; CHECK: !1 = !{ptr @__test_block_invoke_2_kernel.runtime.handle}
+; CHECK: !2 = !{ptr @block_has_used_kernel_address.runtime.handle}
+; CHECK: !3 = !{ptr @.runtime.handle}
+; CHECK: !4 = !{ptr @.runtime.handle.1}
+; CHECK: !5 = !{ptr @kernel_linkonce_odr_block.runtime.handle}

@llvmbot
Member

llvmbot commented Feb 24, 2025

@llvm/pr-subscribers-backend-amdgpu



github-actions bot commented Feb 24, 2025

✅ With the latest revision this PR passed the undef deprecator.

@arsenm arsenm force-pushed the users/arsenm/amdgpu/move-opencl-enqueued-block-handling-into-clang branch from 7c230fb to 6b51198 on February 24, 2025 15:09
@arsenm arsenm force-pushed the users/arsenm/amdgpu/bitcode-autoupgrade-old-enqueue-blocks branch 3 times, most recently from 25bdb58 to 364f49f on February 24, 2025 16:08
@arsenm arsenm force-pushed the users/arsenm/amdgpu/move-opencl-enqueued-block-handling-into-clang branch from 8e9fe01 to 6e193cc on March 4, 2025 03:17
@arsenm arsenm force-pushed the users/arsenm/amdgpu/bitcode-autoupgrade-old-enqueue-blocks branch 2 times, most recently from bcd2c36 to d2479a3 on March 6, 2025 16:22
@arsenm arsenm force-pushed the users/arsenm/amdgpu/move-opencl-enqueued-block-handling-into-clang branch 2 times, most recently from e910f85 to b4c7b92 on March 7, 2025 00:36
@arsenm arsenm force-pushed the users/arsenm/amdgpu/bitcode-autoupgrade-old-enqueue-blocks branch 2 times, most recently from 96b8e6d to e824520 on March 7, 2025 12:22
@arsenm arsenm force-pushed the users/arsenm/amdgpu/move-opencl-enqueued-block-handling-into-clang branch from b4c7b92 to 1b5f2f2 on March 7, 2025 12:22
arsenm added 3 commits March 10, 2025 17:49
The previous implementation wasn't maintaining a faithful IR
representation of how this really works. The value returned by
createEnqueuedBlockKernel wasn't actually used as a function, and was
hacked up later to be a pointer to the runtime handle global
variable. In reality, the enqueued block is a struct whose first
field is a pointer to the kernel descriptor, not the kernel itself. We
were also relying on passing around a reference to a global using a
string attribute containing its name. It's better to base this on a
proper IR symbol reference during final emission.

This now avoids using a function attribute on kernels and avoids using
the additional "runtime-handle" attribute to populate the final
metadata. Instead, the runtime handle is associated with the kernel
through the !associated global metadata. We can then emit a final,
correctly mangled name.

I couldn't figure out how to get rename-with-external-symbol behavior
using a combination of comdats and aliases, so this leaves an IR pass
to externalize the runtime handles for codegen (a sketch of that step
follows the commit messages). If anything breaks, it's most likely
this, so avoiding the extra pass is left for a later step. A special
section name is used to enable this behavior. This also means it's
possible to declare enqueuable kernels in source without going through
the dedicated block syntax or other dedicated compiler support.

We could move towards initializing the runtime handle in the
compiler/linker. I have a working patch where the linker sets up the
first field of the handle, avoiding the need to export the block
kernel symbol for the runtime. We would need new relocations to get
the private and group sizes, but that would avoid the runtime's
special case handling that requires the device_enqueue_symbol metadata
field.

https://reviews.llvm.org/D141700
This introduces a circular dependency in the build for appendToUsed, and
I'm not sure it's worth the trouble to fix. We can most likely get away
without upgrading this. Alternatively, we could move appendToUsed /
appendToCompilerUsed directly into Module.
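
The externalization step referenced in the first commit message can be
pictured with a minimal sketch over the special section name. This is not
the actual pass in the patch stack; the function name and the choice of
protected visibility are assumptions:

#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"

// Minimal sketch: promote every runtime handle found by its section name
// to an externally visible symbol so the final, mangled name reaches the
// runtime. The real pass may do more (e.g. also export the block kernels).
static bool externalizeBlockRuntimeHandles(llvm::Module &M) {
  using namespace llvm;
  bool Changed = false;
  for (GlobalVariable &GV : M.globals()) {
    if (GV.getSection() != ".amdgpu.kernel.runtime.handle")
      continue;
    GV.setLinkage(GlobalValue::ExternalLinkage);
    GV.setVisibility(GlobalValue::ProtectedVisibility); // assumed choice
    Changed = true;
  }
  return Changed;
}
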
@arsenm arsenm force-pushed the users/arsenm/amdgpu/move-opencl-enqueued-block-handling-into-clang branch from 1b5f2f2 to 2de1a05 on March 10, 2025 10:54
@arsenm arsenm force-pushed the users/arsenm/amdgpu/bitcode-autoupgrade-old-enqueue-blocks branch from e824520 to 652675b on March 10, 2025 10:54
Base automatically changed from users/arsenm/amdgpu/move-opencl-enqueued-block-handling-into-clang to main March 10, 2025 12:54
@arsenm
Contributor Author

arsenm commented Mar 12, 2025

Not bothering with this

@arsenm arsenm closed this Mar 12, 2025
@arsenm arsenm deleted the users/arsenm/amdgpu/bitcode-autoupgrade-old-enqueue-blocks branch April 25, 2025 11:01