-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AMDGPU] Promote nestedGEP allocas to vectors #141199
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Harrison Hao (harrisonGPU) ChangesSupports the %SortedFragments = alloca [10 x <2 x i32>], addrspace(5), align 8
%row = getelementptr [10 x <2 x i32>], ptr addrspace(5) %SortedFragments, i32 0, i32 %j
%elt1 = getelementptr i8, ptr addrspace(5) %row, i32 4
%val = load i32, ptr addrspace(5) %elt1 The pass folds the two levels of addressing into a single vector lane %vec = freeze <20 x i32> poison ; alloca promote <20 x i32>
%idx0 = mul i32 %j, 2 ; j * 2
%idx = add i32 %idx0, 1 ; j * 2 + 1
%val = extractelement <20 x i32> %vec, i32 %idx This eliminates the scratch read and leaves no LDS traffic. Full diff: https://github.com/llvm/llvm-project/pull/141199.diff 3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 517d05f6514d5..4355f195f2c88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -437,8 +437,62 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
SmallMapVector<Value *, APInt, 4> VarOffsets;
APInt ConstOffset(BW, 0);
- if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
- !GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
+
+ // Walk backwards through nested GEPs to collect both constant and variable
+ // offsets, so that nested vector GEP chains can be lowered in one step.
+ //
+ // Given this IR fragment as input:
+ //
+ // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
+ // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
+ // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
+ // %3 = load i32, ptr addrspace(5) %2, align 4
+ //
+ // Combine both GEP operations in a single pass, producing:
+ // BasePtr = %0
+ // ConstOffset = 4
+ // VarOffsets = { %j → element_size(<2 x i32>) }
+ //
+ // That lets us emit a single buffer_load directly into a VGPR, without ever
+ // allocating scratch memory for the intermediate pointer.
+ Value *CurPtr = GEP;
+ while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
+ SmallMapVector<Value *, APInt, 4> LocalVarsOffsets;
+ APInt LocalConstOffset(BW, 0);
+
+ if (!CurGEP->collectOffset(DL, BW, LocalVarsOffsets, LocalConstOffset))
+ return nullptr;
+
+ // Merge any variable-index contributions into the accumulated VarOffsets
+ // map.
+ // Only a single pointer variable is allowed in the entire GEP chain.
+ // If VarOffsets already holds a different pointer, abort.
+ //
+ // Example:
+ // Suppose LocalVarsOffsets = { (%ptr → 4) } from this GEP, and
+ // VarOffsets already has { (%ptr → 8) } from an inner GEP.
+ // After this loop, VarOffsets should become { (%ptr → 12) }.
+ for (auto &VarEntry : LocalVarsOffsets) {
+ // If VarOffsets already records a different pointer, abort.
+ if (!VarOffsets.empty() && !VarOffsets.count(VarEntry.first))
+ return nullptr;
+
+ // Look up whether we’ve seen this pointer before.
+ auto *Existing = VarOffsets.find(VarEntry.first);
+ if (Existing == VarOffsets.end())
+ VarOffsets.insert({VarEntry.first, VarEntry.second});
+ else
+ Existing->second += VarEntry.second;
+ }
+
+ ConstOffset += LocalConstOffset;
+
+ // Move to the next outer pointer
+ CurPtr = CurGEP->getPointerOperand()->stripPointerCasts();
+ }
+
+ // Only proceed if this GEP stems from the same alloca.
+ if (CurPtr->stripPointerCasts() != Alloca)
return nullptr;
unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal.ll b/llvm/test/CodeGen/AMDGPU/amdpal.ll
index 2e47b0163aa8c..695e80a7cc0a6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefixes=PAL,CI --enable-var-scope %s
-; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga | FileCheck --check-prefixes=PAL,VI --enable-var-scope %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti | FileCheck --check-prefixes=PAL --enable-var-scope %s
+; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga | FileCheck --check-prefixes=PAL --enable-var-scope %s
; PAL-NOT: .AMDGPU.config
; PAL-LABEL: {{^}}simple:
@@ -51,17 +51,12 @@ entry:
ret void
}
-; Check code sequence for amdpal use of scratch for alloca in a compute shader.
-; The scratch descriptor is loaded from offset 0x10 of the GIT, rather than offset
-; 0 in a graphics shader.
-; Prior to GCN3 s_load_dword offsets are dwords, so the offset will be 0x4.
+; After the change that **promotes the alloca to a vector** (GEP‑of‑GEP
+; promotion), no scratch buffer is needed, so the descriptor load should
+; disappear.
; PAL-LABEL: {{^}}scratch2_cs:
-; PAL: s_movk_i32 s{{[0-9]+}}, 0x1234
-; PAL: s_mov_b32 s[[GITPTR:[0-9]+]], s0
-; CI: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]:{{[0-9]+\]}}, 0x4
-; VI: s_load_dwordx4 s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s[[[GITPTR]]:{{[0-9]+\]}}, 0x10
-; PAL: buffer_store{{.*}}, s[[[SCRATCHDESC]]:
+; PAL: buffer_store{{.*}}, s[[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}
define amdgpu_cs void @scratch2_cs(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32> %coord, <2 x i32> %in, i32 %extra, i32 %idx) #0 {
entry:
@@ -88,6 +83,6 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32,
; PAL-NEXT: .cs:
; PAL-NEXT: .entry_point: _amdgpu_cs_main
; PAL-NEXT: .entry_point_symbol: scratch2_cs
-; PAL-NEXT: .scratch_memory_size: 0x10
+; PAL-NEXT: .scratch_memory_size: 0
; PAL-NEXT: .sgpr_count: 0x
; PAL-NEXT: .vgpr_count: 0x
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll
new file mode 100644
index 0000000000000..40a81d04d09c8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+target triple = "amdgcn-amd-amdhsa"
+define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(i32 %j) #0 {
+; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(
+; CHECK-SAME: i32 [[J:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[SORTEDFRAGMENTS:%.*]] = freeze <20 x i32> poison
+; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[J]], 2
+; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[J]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 1, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <20 x i32> [[SORTEDFRAGMENTS]], i32 [[TMP2]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %SortedFragments = alloca [10 x <2 x i32>], align 8, addrspace(5)
+ %0 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %SortedFragments, i32 0, i32 %j
+ %1 = getelementptr i8, ptr addrspace(5) %0, i32 4
+ %2 = load i32, ptr addrspace(5) %1, align 4
+ ret void
+}
+
+attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="32" }
+
+define amdgpu_cs void @scalar_alloca_ptr_with_vector_gep_of_scratch(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32> %coord, <2 x i32> %in, i32 %extra, i32 %idx) #1 {
+; CHECK-LABEL: define amdgpu_cs void @scalar_alloca_ptr_with_vector_gep_of_scratch(
+; CHECK-SAME: i32 inreg [[TMP0:%.*]], i32 inreg [[TMP1:%.*]], i32 inreg [[TMP2:%.*]], <3 x i32> inreg [[TMP3:%.*]], i32 inreg [[TMP4:%.*]], <3 x i32> [[COORD:%.*]], <2 x i32> [[IN:%.*]], i32 [[EXTRA:%.*]], i32 [[IDX:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[V:%.*]] = freeze <3 x i32> poison
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x i32> [[V]], i32 [[EXTRA]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[IN]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x i32> [[TMP5]], i32 [[TMP6]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[IN]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x i32> [[TMP7]], i32 [[TMP8]], i32 2
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 1, [[IDX]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <3 x i32> [[TMP9]], i32 [[TMP10]]
+; CHECK-NEXT: [[XF:%.*]] = bitcast i32 [[TMP11]] to float
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float [[XF]], ptr addrspace(8) poison, i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret void
+;
+entry:
+ %v = alloca [3 x i32], addrspace(5)
+ %v1 = getelementptr [3 x i32], ptr addrspace(5) %v, i32 0, i32 1
+ store i32 %extra, ptr addrspace(5) %v
+ store <2 x i32> %in, ptr addrspace(5) %v1
+ %e = getelementptr [2 x i32], ptr addrspace(5) %v1, i32 0, i32 %idx
+ %x = load i32, ptr addrspace(5) %e
+ %xf = bitcast i32 %x to float
+ call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %xf, ptr addrspace(8) poison, i32 0, i32 0, i32 0)
+ ret void
+}
+
+attributes #1 = { nounwind "amdgpu-git-ptr-high"="0x1234" }
+
+declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg)
|
%0 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %SortedFragments, i32 0, i32 %j | ||
%1 = getelementptr i8, ptr addrspace(5) %0, i32 4 | ||
%2 = load i32, ptr addrspace(5) %1, align 4 | ||
ret void |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to make this not dead code. Also use named values in tests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have updated it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is still dead code
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean not use alloca after load? right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understood your mind! Thanks !
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have updated it, what do you think about it? :-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"dead code" in this case refers to the fact that this alloca has no stores, no valid data.
This means the result of the load will always be poison, so load and all associated instructions can be removed if we made the compiler infer that.
Have a look at other existing promote alloca tests for some ideas of putting data into alloca.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I have updated it, and I inserted valid data to alloca, what do you think about it?
llvm/test/CodeGen/AMDGPU/amdpal.ll
Outdated
; After the change that **promotes the alloca to a vector** (GEP‑of‑GEP | ||
; promotion), no scratch buffer is needed, so the descriptor load should | ||
; disappear. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This defeats the purpose of the test. You need to defeat the stack optimization and keep the stack usage
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, I have updated file check.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or disable promote alloca for this test.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to see this test scratch memory change to zero, so I think it is necessary to change, what do you think about it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As previously stated. The test is intended to check scratch descriptor is correctly accessed, thus it must access scratch. Test should not change.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I have updated it and disable promote alloca for amdpal.ll
llvm/test/CodeGen/AMDGPU/amdpal.ll
Outdated
; After the change that **promotes the alloca to a vector** (GEP‑of‑GEP | ||
; promotion), no scratch buffer is needed, so the descriptor load should | ||
; disappear. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Or disable promote alloca for this test.
} | ||
|
||
// Only proceed if this GEP stems from the same alloca. | ||
if (CurPtr != Alloca) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How would we end up in this situation?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. It won't actually end up here, but I'd like to add an assert to verify that it directly references the alloca.
llvm/test/CodeGen/AMDGPU/amdpal.ll
Outdated
; After the change that **promotes the alloca to a vector** (GEP‑of‑GEP | ||
; promotion), no scratch buffer is needed, so the descriptor load should | ||
; disappear. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As previously stated. The test is intended to check scratch descriptor is correctly accessed, thus it must access scratch. Test should not change.
llvm/test/CodeGen/AMDGPU/amdpal.ll
Outdated
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tahiti -disable-promote-alloca-to-vector | FileCheck --check-prefixes=PAL,CI --enable-var-scope %s | ||
; RUN: llc < %s -mtriple=amdgcn--amdpal -mcpu=tonga -disable-promote-alloca-to-vector | FileCheck --check-prefixes=PAL,VI --enable-var-scope %s |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to just make the access volatile
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I have updated it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mostly LGTM
CurPtr = CurGEP->getPointerOperand(); | ||
} | ||
|
||
assert(CurPtr == Alloca && "GEP not based on alloca"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure this is safe to assert?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am sure it is safe ! Because it is from the following code:
llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Lines 364 to 370 in c3656af
for (AllocaInst *AI : Allocas) { | |
const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); | |
// First, check if we have enough budget to vectorize this alloca. | |
if (AllocaCost <= VectorizationBudget) { | |
// If we do, attempt vectorization, otherwise, fall through and try | |
// promoting to LDS instead. | |
if (tryPromoteAllocaToVector(*AI)) { |
So it is must alloca type.
CurPtr = CurGEP->getPointerOperand(); | ||
} | ||
|
||
assert(CurPtr == Alloca && "GEP not based on alloca"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we need this assertion here? should it be an assertion or error (which returns a nullptr)?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It won't actually case assertion , but I'd like to add an assert to verify that it directly references the alloca.
503b83c
to
a23e944
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Supports the
nestedGEP
pattern thatappears when an alloca is first indexed as an array element and then
shifted with a byte‑offset GEP:
The pass folds the two levels of addressing into a single vector lane
index and keeps the whole object in a VGPR:
This eliminates the scratch read.