Skip to content

Commit 1a7f5f5

Browse files
authored
[AMDGPU] Promote nestedGEP allocas to vectors (#141199)
Supports the `nestedGEP` pattern that appears when an alloca is first indexed as an array element and then shifted with a byte‑offset GEP: ```llvm %SortedFragments = alloca [10 x <2 x i32>], addrspace(5), align 8 %row = getelementptr [10 x <2 x i32>], ptr addrspace(5) %SortedFragments, i32 0, i32 %j %elt1 = getelementptr i8, ptr addrspace(5) %row, i32 4 %val = load i32, ptr addrspace(5) %elt1 ``` The pass folds the two levels of addressing into a single vector lane index and keeps the whole object in a VGPR: ```llvm %vec = freeze <20 x i32> poison ; alloca promote <20 x i32> %idx0 = mul i32 %j, 2 ; j * 2 %idx = add i32 %idx0, 1 ; j * 2 + 1 %val = extractelement <20 x i32> %vec, i32 %idx ``` This eliminates the scratch read.
1 parent 58cc167 commit 1a7f5f5

File tree

3 files changed

+108
-4
lines changed

3 files changed

+108
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -437,9 +437,34 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
437437
unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
438438
SmallMapVector<Value *, APInt, 4> VarOffsets;
439439
APInt ConstOffset(BW, 0);
440-
if (GEP->getPointerOperand()->stripPointerCasts() != Alloca ||
441-
!GEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
442-
return nullptr;
440+
441+
// Walk backwards through nested GEPs to collect both constant and variable
442+
// offsets, so that nested vector GEP chains can be lowered in one step.
443+
//
444+
// Given this IR fragment as input:
445+
//
446+
// %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
447+
// %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
448+
// %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
449+
// %3 = load i32, ptr addrspace(5) %2, align 4
450+
//
451+
// Combine both GEP operations in a single pass, producing:
452+
// BasePtr = %0
453+
// ConstOffset = 4
454+
// VarOffsets = { %j -> element_size(<2 x i32>) }
455+
//
456+
// That lets us emit a single buffer_load directly into a VGPR, without ever
457+
// allocating scratch memory for the intermediate pointer.
458+
Value *CurPtr = GEP;
459+
while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
460+
if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
461+
return nullptr;
462+
463+
// Move to the next outer pointer.
464+
CurPtr = CurGEP->getPointerOperand();
465+
}
466+
467+
assert(CurPtr == Alloca && "GEP not based on alloca");
443468

444469
unsigned VecElemSize = DL.getTypeAllocSize(VecElemTy);
445470
if (VarOffsets.size() > 1)

llvm/test/CodeGen/AMDGPU/amdpal.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ entry:
7070
store i32 %extra, ptr addrspace(5) %v
7171
store <2 x i32> %in, ptr addrspace(5) %v1
7272
%e = getelementptr [2 x i32], ptr addrspace(5) %v1, i32 0, i32 %idx
73-
%x = load i32, ptr addrspace(5) %e
73+
%x = load volatile i32, ptr addrspace(5) %e
7474
%xf = bitcast i32 %x to float
7575
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %xf, ptr addrspace(8) poison, i32 0, i32 0, i32 0)
7676
ret void
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
3+
define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(i32 %idx, ptr addrspace(1) %output) #0 {
4+
; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(
5+
; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0:[0-9]+]] {
6+
; CHECK-NEXT: [[ENTRY:.*:]]
7+
; CHECK-NEXT: [[BUF:%.*]] = freeze <20 x i32> poison
8+
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2
9+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]]
10+
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1
11+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <20 x i32> [[TMP1]], i32 2, i32 [[TMP2]]
12+
; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[IDX]], 2
13+
; CHECK-NEXT: [[TMP5:%.*]] = add i32 1, [[TMP4]]
14+
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP5]]
15+
; CHECK-NEXT: store i32 [[TMP6]], ptr addrspace(1) [[OUTPUT]], align 4
16+
; CHECK-NEXT: ret void
17+
;
18+
entry:
19+
%alloca = alloca [10 x <2 x i32>], align 8, addrspace(5)
20+
%row = getelementptr [10 x <2 x i32>], ptr addrspace(5) %alloca, i32 0, i32 %idx
21+
store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8
22+
%elt1 = getelementptr i8, ptr addrspace(5) %row, i32 4
23+
%val = load i32, ptr addrspace(5) %elt1, align 4
24+
store i32 %val, ptr addrspace(1) %output
25+
ret void
26+
}
27+
28+
define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(i32 %idx, ptr addrspace(1) %output) #0 {
29+
; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(
30+
; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUTPUT:%.*]]) #[[ATTR0]] {
31+
; CHECK-NEXT: [[ENTRY:.*:]]
32+
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <16 x i32> poison
33+
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2
34+
; CHECK-NEXT: [[TMP1:%.*]] = add i32 8, [[TMP0]]
35+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[ALLOCA]], i32 10, i32 [[TMP1]]
36+
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], 1
37+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP2]], i32 20, i32 [[TMP3]]
38+
; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[IDX]], 2
39+
; CHECK-NEXT: [[TMP6:%.*]] = add i32 9, [[TMP5]]
40+
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP4]], i32 [[TMP6]]
41+
; CHECK-NEXT: store i32 [[TMP7]], ptr addrspace(1) [[OUTPUT]], align 4
42+
; CHECK-NEXT: ret void
43+
;
44+
entry:
45+
%alloca = alloca [2 x [4 x <2 x i32>]], align 8, addrspace(5)
46+
%lvl1 = getelementptr inbounds [2 x [4 x <2 x i32>]], ptr addrspace(5) %alloca, i32 0, i32 1
47+
%lvl2 = getelementptr inbounds [4 x <2 x i32>], ptr addrspace(5) %lvl1, i32 0, i32 %idx
48+
store <2 x i32> <i32 10, i32 20>, ptr addrspace(5) %lvl2, align 8
49+
%byte = getelementptr inbounds i8, ptr addrspace(5) %lvl2, i32 4
50+
%val = load i32, ptr addrspace(5) %byte, align 4
51+
store i32 %val, ptr addrspace(1) %output
52+
ret void
53+
}
54+
55+
define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx(i32 %idx, ptr addrspace(1) %out) #0 {
56+
; CHECK-LABEL: define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_twice_idx(
57+
; CHECK-SAME: i32 [[IDX:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
58+
; CHECK-NEXT: [[ENTRY:.*:]]
59+
; CHECK-NEXT: [[BUF:%.*]] = freeze <20 x i32> poison
60+
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2
61+
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <20 x i32> [[BUF]], i32 1, i32 [[TMP0]]
62+
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP0]], 1
63+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <20 x i32> [[TMP4]], i32 2, i32 [[TMP5]]
64+
; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[IDX]], 3
65+
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP1]]
66+
; CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[OUT]], align 4
67+
; CHECK-NEXT: ret void
68+
;
69+
entry:
70+
%alloca = alloca [10 x [2 x i32]], align 8, addrspace(5)
71+
%row = getelementptr inbounds [10 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 %idx
72+
store <2 x i32> <i32 1, i32 2>, ptr addrspace(5) %row, align 8
73+
%elt = getelementptr inbounds [2 x i32], ptr addrspace(5) %row, i32 0, i32 %idx
74+
%val = load i32, ptr addrspace(5) %elt, align 4
75+
store i32 %val, ptr addrspace(1) %out
76+
ret void
77+
}
78+
79+
attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="32" }

0 commit comments

Comments
 (0)