
Commit 96b8e6d

AMDGPU: Implement bitcode autoupgrade for old style enqueue blocks
Introduces a circular dependency in the build for appendToUsed, and I'm not sure it's worth the trouble to fix. We can most likely get away without upgrading this. We could move appendToUsed / appendToCompilerUsed directly into Module.
1 parent: b4c7b92
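In the old scheme, a block invoke kernel was identified by an "enqueued-block" function attribute. The upgrade drops that attribute and instead emits an externally initialized "runtime handle" global for the kernel, ties the two together with !associated metadata, and keeps both symbols alive through @llvm.used. A minimal before/after sketch of the effect (the kernel name @invoke_kernel is made up for illustration; the exact shapes follow the test below):

; Before: old-style bitcode marks the kernel with an attribute.
define internal amdgpu_kernel void @invoke_kernel() #0 {
  ret void
}
attributes #0 = { "enqueued-block" }

; After autoupgrade: the attribute is gone; the kernel gets a
; zero-initialized, externally initialized handle global in a dedicated
; section (inheriting the function's linkage), !associated metadata
; pointing at it, and both are recorded in @llvm.used.
%block.runtime.handle.t = type { ptr, i32, i32 }

@invoke_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
@llvm.used = appending global [2 x ptr] [ptr @invoke_kernel, ptr @invoke_kernel.runtime.handle], section "llvm.metadata"

define internal amdgpu_kernel void @invoke_kernel() !associated !0 {
  ret void
}

!0 = !{ptr @invoke_kernel.runtime.handle}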

3 files changed: +188 −0 lines changed

llvm/lib/IR/AutoUpgrade.cpp (49 additions, 0 deletions)

@@ -48,6 +48,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Regex.h"
 #include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <cstdint>
 #include <cstring>
 #include <numeric>
@@ -5518,6 +5519,51 @@ struct AMDGPUUnsafeFPAtomicsUpgradeVisitor
 };
 } // namespace
 
+static StructType *getAMDGPURuntimeHandleType(LLVMContext &C,
+                                              Type *KernelDescriptorPtrTy) {
+  Type *Int32 = Type::getInt32Ty(C);
+  return StructType::create(C, {KernelDescriptorPtrTy, Int32, Int32},
+                            "block.runtime.handle.t");
+}
+
+/// Rewrite to new scheme for enqueued block lowering
+static void upgradeAMDGPUKernelEnqueuedBlock(Function &F) {
+  if (F.isMaterializable()) {
+    // A verifier error is produced if we add metadata to the function during
+    // linking.
+    return;
+  }
+
+  const StringLiteral EnqueuedBlockName("enqueued-block");
+  if (!F.hasFnAttribute(EnqueuedBlockName))
+    return;
+
+  F.removeFnAttr(EnqueuedBlockName);
+
+  Module *M = F.getParent();
+  LLVMContext &Ctx = M->getContext();
+  const DataLayout &DL = M->getDataLayout();
+
+  StructType *HandleTy = getAMDGPURuntimeHandleType(
+      Ctx, PointerType::get(Ctx, DL.getDefaultGlobalsAddressSpace()));
+
+  Twine RuntimeHandleName = F.getName() + ".runtime.handle";
+
+  auto *RuntimeHandle = new GlobalVariable(
+      *M, HandleTy,
+      /*isConstant=*/true, F.getLinkage(),
+      /*Initializer=*/ConstantAggregateZero::get(HandleTy), RuntimeHandleName,
+      /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
+      DL.getDefaultGlobalsAddressSpace(),
+      /*isExternallyInitialized=*/true);
+  RuntimeHandle->setSection(".amdgpu.kernel.runtime.handle");
+
+  MDNode *HandleAsMD = MDNode::get(Ctx, ValueAsMetadata::get(RuntimeHandle));
+  F.setMetadata(LLVMContext::MD_associated, HandleAsMD);
+
+  appendToUsed(*M, {&F, RuntimeHandle});
+}
+
 void llvm::UpgradeFunctionAttributes(Function &F) {
   // If a function definition doesn't have the strictfp attribute,
   // convert any callsite strictfp attributes to nobuiltin.
@@ -5558,6 +5604,9 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
       F.removeFnAttr("amdgpu-unsafe-fp-atomics");
     }
   }
+
+  if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
+    upgradeAMDGPUKernelEnqueuedBlock(F);
 }
 
 static bool isOldLoopArgument(Metadata *MD) {

llvm/lib/IR/CMakeLists.txt (1 addition, 0 deletions)

@@ -92,6 +92,7 @@ add_llvm_component_library(LLVMCore
   LINK_COMPONENTS
   BinaryFormat
   Demangle
+  TransformUtils
   Remarks
   Support
   TargetParser
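The new TransformUtils entry is the circular dependency the commit message mentions: appendToUsed is declared in llvm/Transforms/Utils/ModuleUtils.h and is part of LLVMTransformUtils, which itself links against LLVMCore. Functionally, appendToUsed(*M, {&F, RuntimeHandle}) merges new entries into the module's @llvm.used array, rebuilding it with the combined operand list. A small sketch of the effect on a module (the symbols @a and @b are hypothetical):

; Before the call: one entry is already tracked.
@llvm.used = appending global [1 x ptr] [ptr @a], section "llvm.metadata"

; After appendToUsed(M, {B}), where B is the GlobalValue for @b:
; the old array is replaced by one holding both operands.
@llvm.used = appending global [2 x ptr] [ptr @a, ptr @b], section "llvm.metadata"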
New test file (138 additions, 0 deletions)

@@ -0,0 +1,138 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+%struct.ndrange_t = type { i32 }
+%opencl.queue_t = type opaque
+
+; CHECK: %block.runtime.handle.t = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.0 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.1 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.2 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.3 = type { ptr, i32, i32 }
+; CHECK: %block.runtime.handle.t.4 = type { ptr, i32, i32 }
+
+
+; CHECK: @kernel_address_user = global [1 x ptr] [ptr @block_has_used_kernel_address]
+; CHECK: @__test_block_invoke_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @__test_block_invoke_2_kernel.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.0 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @block_has_used_kernel_address.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.1 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @.runtime.handle = internal externally_initialized constant %block.runtime.handle.t.2 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @.runtime.handle.1 = internal externally_initialized constant %block.runtime.handle.t.3 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @kernel_linkonce_odr_block.runtime.handle = linkonce_odr externally_initialized constant %block.runtime.handle.t.4 zeroinitializer, section ".amdgpu.kernel.runtime.handle"
+; CHECK: @llvm.used = appending global [12 x ptr] [ptr @__test_block_invoke_kernel, ptr @__test_block_invoke_kernel.runtime.handle, ptr @__test_block_invoke_2_kernel, ptr @__test_block_invoke_2_kernel.runtime.handle, ptr @block_has_used_kernel_address, ptr @block_has_used_kernel_address.runtime.handle, ptr @0, ptr @.runtime.handle, ptr @1, ptr @.runtime.handle.1, ptr @kernel_linkonce_odr_block, ptr @kernel_linkonce_odr_block.runtime.handle], section "llvm.metadata"
+
+
+define amdgpu_kernel void @non_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
+  ret void
+}
+
+define amdgpu_kernel void @caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
+entry:
+  %block = alloca <{ i32, i32, ptr addrspace(1), i8 }>, align 8, addrspace(5)
+  %inst = alloca %struct.ndrange_t, align 4, addrspace(5)
+  %block2 = alloca <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5)
+  %inst3 = alloca %struct.ndrange_t, align 4, addrspace(5)
+  %block.size = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 0
+  store i32 25, ptr addrspace(5) %block.size, align 8
+  %block.align = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 1
+  store i32 8, ptr addrspace(5) %block.align, align 4
+  %block.captured = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 2
+  store ptr addrspace(1) %a, ptr addrspace(5) %block.captured, align 8
+  %block.captured1 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), i8 }>, ptr addrspace(5) %block, i32 0, i32 3
+  store i8 %b, ptr addrspace(5) %block.captured1, align 8
+  %inst4 = addrspacecast ptr addrspace(5) %block to ptr
+  %inst5 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+      ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
+  %inst10 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+      ptr @__test_block_invoke_kernel, ptr nonnull %inst4) #2
+  %inst11 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+      ptr @0, ptr nonnull %inst4) #2
+  %inst12 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst,
+      ptr @1, ptr nonnull %inst4) #2
+  %block.size4 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 0
+  store i32 41, ptr addrspace(5) %block.size4, align 8
+  %block.align5 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 1
+  store i32 8, ptr addrspace(5) %block.align5, align 4
+  %block.captured7 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 2
+  store ptr addrspace(1) %a, ptr addrspace(5) %block.captured7, align 8
+  %block.captured8 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 5
+  store i8 %b, ptr addrspace(5) %block.captured8, align 8
+  %block.captured9 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 3
+  store ptr addrspace(1) %c, ptr addrspace(5) %block.captured9, align 8
+  %block.captured10 = getelementptr inbounds <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr addrspace(5) %block2, i32 0, i32 4
+  store i64 %d, ptr addrspace(5) %block.captured10, align 8
+  %inst8 = addrspacecast ptr addrspace(5) %block2 to ptr
+  %inst9 = call i32 @__enqueue_kernel_basic(ptr addrspace(1) poison, i32 0, ptr addrspace(5) byval(%struct.ndrange_t) nonnull %inst3,
+      ptr @__test_block_invoke_2_kernel, ptr nonnull %inst8) #2
+  ret void
+}
+
+; __enqueue_kernel* functions may get inlined
+define amdgpu_kernel void @inlined_caller(ptr addrspace(1) %a, i8 %b, ptr addrspace(1) %c, i64 %d) {
+entry:
+  %inst = load i64, ptr addrspace(1) addrspacecast (ptr @__test_block_invoke_kernel to ptr addrspace(1))
+  store i64 %inst, ptr addrspace(1) %c
+  ret void
+}
+
+; CHECK: define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !0 {
+define internal amdgpu_kernel void @__test_block_invoke_kernel(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
+  %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
+  store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
+  ret void
+}
+
+declare i32 @__enqueue_kernel_basic(ptr addrspace(1), i32, ptr addrspace(5), ptr, ptr) local_unnamed_addr
+
+; CHECK: define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) !associated !1 {
+define internal amdgpu_kernel void @__test_block_invoke_2_kernel(<{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg) #0 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 2
+  %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 3
+  %.fca.5.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 4
+  %.fca.6.extract = extractvalue <{ i32, i32, ptr addrspace(1), ptr addrspace(1), i64, i8 }> %arg, 5
+  store i8 %.fca.6.extract, ptr addrspace(1) %.fca.3.extract, align 1
+  store i64 %.fca.5.extract, ptr addrspace(1) %.fca.4.extract, align 8
+  ret void
+}
+
+@kernel_address_user = global [1 x ptr] [ ptr @block_has_used_kernel_address ]
+
+; CHECK: define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !2 {
+define internal amdgpu_kernel void @block_has_used_kernel_address(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+entry:
+  %.fca.3.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 2
+  %.fca.4.extract = extractvalue <{ i32, i32, ptr addrspace(1), i8 }> %arg, 3
+  store i8 %.fca.4.extract, ptr addrspace(1) %.fca.3.extract, align 1
+  ret void
+}
+
+define amdgpu_kernel void @user_of_kernel_address(ptr addrspace(1) %arg) {
+  store ptr @block_has_used_kernel_address, ptr addrspace(1) %arg
+  ret void
+}
+
+; CHECK: define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !3 {
+define internal amdgpu_kernel void @0(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+  ret void
+}
+
+; CHECK: define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) !associated !4 {
+define internal amdgpu_kernel void @1(<{ i32, i32, ptr addrspace(1), i8 }> %arg) #0 {
+  ret void
+}
+
+; CHECK: define linkonce_odr amdgpu_kernel void @kernel_linkonce_odr_block() !associated !5 {
+define linkonce_odr amdgpu_kernel void @kernel_linkonce_odr_block() #0 {
+  ret void
+}
+
+attributes #0 = { "enqueued-block" }
+
+; CHECK: !0 = !{ptr @__test_block_invoke_kernel.runtime.handle}
+; CHECK: !1 = !{ptr @__test_block_invoke_2_kernel.runtime.handle}
+; CHECK: !2 = !{ptr @block_has_used_kernel_address.runtime.handle}
+; CHECK: !3 = !{ptr @.runtime.handle}
+; CHECK: !4 = !{ptr @.runtime.handle.1}
+; CHECK: !5 = !{ptr @kernel_linkonce_odr_block.runtime.handle}
