[AMDGPU] Generalize amdgcn.make.buffer.rsrc to fat pointers #126828
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-llvm-ir

Author: Krzysztof Drewniak (krzysz00)

Changes

While attempting to use ptr addrspace(7), I discovered that InferAddressSpaces would fold away the addrspace 8-to-7 cast that was the original operator for converting from buffer resources to buffer fat pointers. To resolve this problem, and thus prevent illegal `getelementptr T, ptr addrspace(8) %x, ...`s from being produced, this commit extends amdgcn.make.buffer.rsrc to also be variadic in its result type, auto-upgrading old manglings.

The logic for handling a make.buffer.rsrc in instruction selection remains untouched and expects the output type to be a ptr addrspace(8), as does the Clang lowering for its builtin (the pointer-to-pointer version might want a different name in clang). LowerBufferFatPointers has been updated to lower amdgcn.make.buffer.rsrc.p7.p* to amdgcn.make.buffer.rsrc.p8.p*.

Patch is 48.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126828.diff

17 Files Affected:
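To make the new overloading concrete before the patch excerpt, here is a minimal IR sketch (value names and argument values are placeholders, not taken from the patch): the classic form returns a buffer resource, while the generalized form can now return a buffer fat pointer directly, with its offset starting at 0.

  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %base, i16 0, i32 %extent, i32 0)
  %fat = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %base, i16 0, i32 %extent, i32 0)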
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 361e4c4bf2e2e..46d93b533d608 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -20712,9 +20712,19 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_bitop3_b16:
return emitBuiltinWithOneOverloadedType<4>(*this, E,
Intrinsic::amdgcn_bitop3);
- case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc:
- return emitBuiltinWithOneOverloadedType<4>(
- *this, E, Intrinsic::amdgcn_make_buffer_rsrc);
+ case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: {
+ // Note: LLVM has this overloaded to allow for fat pointers, but since
+ // those haven't been plumbed through to Clang yet, default to creating the
+ // resource type.
+ SmallVector<Value *, 4> Args;
+ for (unsigned I = 0; I < 4; ++I)
+ Args.push_back(EmitScalarExpr(E->getArg(I)));
+ Type *RetTy = llvm::PointerType::get(Builder.getContext(),
+ llvm::AMDGPUAS::BUFFER_RESOURCE);
+ Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc,
+ {RetTy, Args[0]->getType()});
+ return Builder.CreateCall(F, Args);
+ }
case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8:
case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16:
case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32:
diff --git a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
index c1a30633f3d0a..2342fcefb5f89 100644
--- a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
+++ b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip
@@ -25,7 +25,7 @@
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
+// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP4]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -49,7 +49,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -73,7 +73,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -97,7 +97,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4
-// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
+// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP3]]
//
__device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
index 2c7bc10fb609c..29093c09c39d0 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl
@@ -4,7 +4,7 @@
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) {
@@ -13,7 +13,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) {
@@ -22,7 +22,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p,
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) {
@@ -31,7 +31,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) {
@@ -40,7 +40,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short stride, int num, int flags) {
@@ -49,7 +49,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) {
@@ -58,7 +58,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) {
@@ -67,7 +67,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void
// CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678)
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) {
@@ -76,7 +76,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi
// CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) {
@@ -85,7 +85,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num,
// CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]])
// CHECK-NEXT: ret ptr addrspace(8) [[TMP0]]
//
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) {
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 84980d0c31d4f..31f72a9571720 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -990,7 +990,12 @@ supported for the ``amdgcn`` target.
the stride must be 0, the "add tid" flag must be 0, the swizzle enable bits
must be off, and the extent must be measured in bytes. (On subtargets where
bounds checking may be disabled, buffer fat pointers may choose to enable
- it or not).
+ it or not). The cache swizzle support introduced in gfx942 may be used.
+
+ These pointers can be created by `addrspacecast` from a buffer resource
+ (`ptr addrspace(8)`) or by using `llvm.amdgcn.make.buffer.rsrc` to produce a
+ `ptr addrspace(7)` directly, which produces a buffer fat pointer with an initial
+ offset of 0 and prevents the address space cast from being rewritten away.
**Buffer Resource**
The buffer resource pointer, in address space 8, is the newer form
@@ -1027,6 +1032,12 @@ supported for the ``amdgcn`` target.
the stride is the size of a structured element, the "add tid" flag must be 0,
and the swizzle enable bits must be off.
+ These pointers can be created by `addrspacecast` from a buffer resource
+ (`ptr addrspace(8)`) or by using `llvm.amdgcn.make.buffer.rsrc` to produce a
+ `ptr addrspace(9)` directly, which produces a buffer strided pointer whose initial
+ index and offset values are both 0. This prevents the address space cast from
+ being rewritten away.
+
**Streamout Registers**
Dedicated registers used by the GS NGG Streamout Instructions. The register
file is modelled as a memory in a distinct address space because it is indexed
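The two documentation hunks above describe both ways of creating these pointer types; a brief IR sketch under hypothetical operand values (not drawn from the patch's tests):

  ; via address space cast from a buffer resource
  %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
  ; or directly, with the index and offset fields starting at zero
  %strided = call ptr addrspace(9) @llvm.amdgcn.make.buffer.rsrc.p9.p1(ptr addrspace(1) %base, i16 %stride, i32 %numrecords, i32 %flags)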
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index eb7bde6999491..cf3d801d57366 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1284,11 +1284,24 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
// Data type for buffer resources (V#). Maybe, in the future, we can create a
// similar one for textures (T#).
def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>;
+// Data type for buffer fat pointers, which are a buffer resource (V#) followed by
+// a 32-bit offset. These don't exist in hardware and are a compiler-internal
+// convenience.
+def AMDGPUBufferFatPointerTy : LLVMQualPointerType<7>;
let TargetPrefix = "amdgcn" in {
+// Create a buffer resource wrapping `base` with the specified `stride`,
+// `numrecords`, and `flags`. All of these values will need to be
+// wave-uniform when the buffer instructions are invoked, so non-uniform
+// inputs to this intrinsic will trigger waterfall loops.
+//
+// In addition to creating ptr addrspace(8), the representation of buffer
+// resources, it can create the fat pointers ptr addrspace(7) and ptr addrspace(9),
+// which carry additional offset bits. When this intrinsic is used to create
+// these fat pointers, their offset and index fields (if applicable) are zero.
def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
- [AMDGPUBufferRsrcTy],
+ [llvm_anyptr_ty],
[llvm_anyptr_ty, // base
llvm_i16_ty, // stride (and swizzle control)
llvm_i32_ty, // NumRecords / extent
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index e886a6012b219..91d447eb3ed01 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -29,6 +29,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsRISCV.h"
@@ -1072,6 +1073,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
{F->getReturnType(), F->getArg(1)->getType()});
return true;
}
+ // Old-style make.buffer.rsrc was only variadic in the input pointer
+ if (Name.consume_front("make.buffer.rsrc.") && Name.size() == 2) {
+ // Intrinsic was made more variadic.
+ NewFn = Intrinsic::getOrInsertDeclaration(
+ F->getParent(), Intrinsic::amdgcn_make_buffer_rsrc,
+ {F->getReturnType(), F->getArg(0)->getType()});
+ return true;
+ }
break; // No other 'amdgcn.*'
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index ccb874e6a934e..d1c9382c61ed1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -2067,6 +2067,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) {
switch (IID) {
default:
return false;
+ case Intrinsic::amdgcn_make_buffer_rsrc:
case Intrinsic::ptrmask:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
@@ -2081,6 +2082,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) {
switch (IID) {
default:
break;
+ case Intrinsic::amdgcn_make_buffer_rsrc: {
+ if (!isSplitFatPtr(I.getType()))
+ return {nullptr, nullptr};
+ Value *Base = I.getArgOperand(0);
+ Value *Stride = I.getArgOperand(1);
+ Value *NumRecords = I.getArgOperand(2);
+ Value *Flags = I.getArgOperand(3);
+ auto *SplitType = cast<StructType>(I.getType());
+ Type *RsrcType = SplitType->getElementType(0);
+ Type *OffType = SplitType->getElementType(1);
+ IRB.SetInsertPoint(&I);
+ Value *Rsrc = IRB.CreateIntrinsic(IID, {RsrcType, Base->getType()},
+ {Base, Stride, NumRecords, Flags});
+ copyMetadata(Rsrc, &I);
+ Rsrc->takeName(&I);
+ Value *Zero = Constant::getNullValue(OffType);
+ SplitUsers.insert(&I);
+ return {Rsrc, Zero};
+ }
case Intrinsic::ptrmask: {
Value *Ptr = I.getArgOperand(0);
if (!isSplitFatPtr(Ptr->getType()))
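As a rough sketch of what the new visitIntrinsicInst case above produces (value names are illustrative): a fat-pointer make.buffer.rsrc is rewritten to the resource-returning form, and the offset half of the split pointer is simply zero.

  ; before AMDGPULowerBufferFatPointers
  %fat = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %p, i16 %stride, i32 %numrecords, i32 %flags)
  ; after: the {rsrc, offset} halves become
  %fat.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 %stride, i32 %numrecords, i32 %flags)
  ; offset half: i32 0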
diff --git a/llvm/test/Bitcode/amdgcn-make.buffer.rsrc.ll b/llvm/test/Bitcode/amdgcn-make.buffer.rsrc.ll
new file mode 100644
index 0000000000000..cb36a57072157
--- /dev/null
+++ b/llvm/test/Bitcode/amdgcn-make.buffer.rsrc.ll
@@ -0,0 +1,12 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+define ptr addrspace(8) @old_call(ptr addrspace(1) %p, i16 %a, i32 %b, i32 %c) {
+ ; CHECK: %call = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 %a, i32 %b, i32 %c)
+ ; CHECK-NOT: amdgcn.make.buffer.rsrc.p1
+ %call = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 %a, i32 %b, i32 %c)
+ ret ptr addrspace(8) %call
+}
+
+declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone, i16, i32, i32)
+; CHECK: declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) readnone, i16, i32, i32) #0
+; CHECK-NOT: amdgcn.make.buffer.rsrc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
index 4a151aeca87e4..6171c73d8d2dc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll
@@ -25,7 +25,7 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) {
; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec
; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3
- %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678)
+ %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 1234, i32 5678)
ret ptr addrspace(8) %rsrc
}
@@ -43,7 +43,7 @@ define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) {
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8)
; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
- %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0)
+ %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0)
%loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)
ret float %loaded
}
@@ -74,7 +74,7 @@ define amdgpu_ps ptr addrspace(8) @basic_struct...
[truncated]
The clang and llvm parts look good to me.
I'll go ahead and note that the MLIR changes are, from where I'm standing, a trivial update that just follows the LLVM changes, so I'm considering landing this tomorrow after I've fixed the comments on the LLVM parts.
Can you add a testcase that infer address spaces was breaking? I don't think that should happen. Do you need to modify rewriteIntrinsicWithAddressSpace?
@arsenm Will try and reduce a reproducer tomorrow morning. However, the shape of the problem was that we'd end up with […]. Now, there's an argument that the […]
@arsenm On further investigation, I misdiagnosed the issue and have updated the commit message accordingly. The real problem is the `addrspacecast(addrspacecast(x)) => addrspacecast(x)` fold that was getting rid of the fat pointer intermediate, and then infer-address-spaces did what it was meant to do.
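A minimal sketch of that fold (hypothetical value names): the intermediate fat pointer disappears, leaving a direct cast from the buffer resource,

  %fat = addrspacecast ptr addrspace(8) %rsrc to ptr addrspace(7)
  %flat = addrspacecast ptr addrspace(7) %fat to ptr
  ; folds to:
  %flat = addrspacecast ptr addrspace(8) %rsrc to ptr

after which address-space inference can push users of %flat back onto the addrspace(8) value, producing the illegal getelementptr on a buffer resource described above.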
Force-pushed from 908411c to ed8ff36: "Squashed commit that'll take its value from github pr desc"
[AMDGPU] Generalize amdgcn.make.buffer.rsrc to fat pointers (#126828)

Attempting to pass a `ptr addrspace(7)` to functions that take `ptr` arguments produces undesirable `addrspacecast(addrspacecast(p8 x to p7) to p0) => addrspacecast(p8 x to p0)` folds. This results in illegal GEP operations on buffer resources, which can't be GEP'd. (However, note that, while unimplemented, addrspacecast from ptr addrspace(7) to ptr is legal - it's just an effective address computation.)

To resolve this problem, and thus prevent illegal `getelementptr T, ptr addrspace(8) %x, ...`s from being produced, this commit extends amdgcn.make.buffer.rsrc to also be variadic in its result type, auto-upgrading old manglings.

The logic for handling a make.buffer.rsrc in instruction selection remains untouched and expects the output type to be a ptr addrspace(8), as does the Clang lowering for its builtin (the pointer-to-pointer version might want a different name in clang). LowerBufferFatPointers has been updated to lower amdgcn.make.buffer.rsrc.p7.p* to amdgcn.make.buffer.rsrc.p8.p*.

This'll also make exposing buffer fat pointers in Clang easier, since you don't have to cast between a `__amdgpu_buffer_rsrc_t` and a pointer.
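As the message notes, instruction selection still consumes only the resource form; the pattern from the GlobalISel test excerpted above (reproduced here) is unchanged by this patch:

  %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0)
  %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0)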