
Commit d921bf2

[AMDGPU] Extend promotion of alloca to vectors (#127973)
* Add multi-dimensional array support
* Make maximum vector size tunable
* Make ratio of VGPRs used for vector promotion tunable
* Maximum array size now based on VGPR count (32b) instead of element count
1 parent 57a9088 commit d921bf2

11 files changed: +1017 −48 lines

llvm/docs/AMDGPUUsage.rst (4 additions, 0 deletions)

@@ -1733,6 +1733,10 @@ The AMDGPU backend supports the following LLVM IR attributes.
      "amdgpu-sgpr-hazard-mem-wait-cull-threshold"    Sets the number of active SGPR hazards that must be present before
                                                      inserting a cull sequence at a memory wait.

+     "amdgpu-promote-alloca-to-vector-max-regs"      Maximum vector size (in 32b registers) to create when promoting alloca.
+
+     "amdgpu-promote-alloca-to-vector-vgpr-ratio"    Ratio of VGPRs to budget for promoting alloca to vectors.
+
      ================================================ ==========================================================

 Calling Conventions
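
Both new attributes are parsed per function by the pass (see setFunctionLimits in the diff below), with the equivalent command-line flags taking precedence when given. Below is a minimal, hedged sketch of how they might be attached to a kernel in IR; the kernel itself and the values 32 and 2 are illustrative, not defaults. Under the default limit of 16 registers a flattened <32 x i32> candidate would be rejected, so the max-regs override is what makes this one eligible.

; Hedged sketch: per-function overrides for the new promotion limits.
; Pass defaults are 16 registers and a VGPR ratio of 4 (i.e. 1/4 of the budget).
define amdgpu_kernel void @attr_example(ptr addrspace(1) %out, i32 %idx) #0 {
entry:
  %alloca = alloca [8 x [4 x i32]], align 4, addrspace(5)
  %gep = getelementptr inbounds [8 x [4 x i32]], ptr addrspace(5) %alloca, i32 0, i32 %idx, i32 1
  store i32 42, ptr addrspace(5) %gep
  %v = load i32, ptr addrspace(5) %gep
  store i32 %v, ptr addrspace(1) %out
  ret void
}

attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="32"
                  "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }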

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (94 additions, 25 deletions)

@@ -66,6 +66,19 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
+static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
+    "amdgpu-promote-alloca-to-vector-max-regs",
+    cl::desc(
+        "Maximum vector size (in 32b registers) to use when promoting alloca"),
+    cl::init(16));
+
+// Use up to 1/4 of available register budget for vectorization.
+// FIXME: Increase the limit for whole function budgets? Perhaps x2?
+static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
+    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
+    cl::init(4));
+
 static cl::opt<unsigned>
     LoopUserWeight("promote-alloca-vector-loop-user-weight",
                    cl::desc("The bonus weight of users of allocas within loop "
@@ -84,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
   unsigned MaxVGPRs;
+  unsigned VGPRBudgetRatio;
+  unsigned MaxVectorRegs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;

@@ -112,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {
 
   void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
 
+  void setFunctionLimits(const Function &F);
+
 public:
   AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
 
@@ -298,6 +315,19 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
   // clang-format on
 }
 
+void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
+  // Load per function limits, overriding with global options where appropriate.
+  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
+    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
+  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+      PromoteAllocaToVectorVGPRRatio);
+  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
+    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
+}
+
 bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   Mod = F.getParent();
   DL = &Mod->getDataLayout();

@@ -307,15 +337,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     return false;
 
   MaxVGPRs = getMaxVGPRs(TM, F);
+  setFunctionLimits(F);
 
   bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
 
-  // Use up to 1/4 of available register budget for vectorization.
-  // FIXME: Increase the limit for whole function budgets? Perhaps x2?
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
-      4;
+      VGPRBudgetRatio;
 
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
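
As a worked example, assuming a kernel limited to 256 VGPRs and the default ratio of 4, the vectorization budget is (256 * 32) / 4 = 2048 bits, i.e. 64 32-bit registers' worth of promoted allocas across the function; lowering the ratio to 2 via the flag or attribute doubles that to 4096 bits.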
@@ -400,7 +429,8 @@ static Value *calculateVectorIndex(
 }
 
 static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
-                               Type *VecElemTy, const DataLayout &DL) {
+                               Type *VecElemTy, const DataLayout &DL,
+                               SmallVector<Instruction *> &NewInsts) {
   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
   // helper.
   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());

@@ -414,22 +444,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (VarOffsets.size() > 1)
     return nullptr;
 
-  if (VarOffsets.size() == 1) {
-    // Only handle cases where we don't need to insert extra arithmetic
-    // instructions.
-    const auto &VarOffset = VarOffsets.front();
-    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
-      return nullptr;
-    return VarOffset.first;
-  }
-
   APInt Quot;
   uint64_t Rem;
   APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
   if (Rem != 0)
     return nullptr;
 
-  return ConstantInt::get(GEP->getContext(), Quot);
+  ConstantInt *ConstIndex = ConstantInt::get(GEP->getContext(), Quot);
+  if (VarOffsets.size() == 0)
+    return ConstIndex;
+
+  IRBuilder<> Builder(GEP);
+
+  const auto &VarOffset = VarOffsets.front();
+  APInt::udivrem(VarOffset.second, VecElemSize, Quot, Rem);
+  if (Rem != 0 || Quot.isZero())
+    return nullptr;
+
+  Value *Offset = VarOffset.first;
+  if (!Quot.isOne()) {
+    ConstantInt *ConstMul = ConstantInt::get(GEP->getContext(), Quot);
+    Offset = Builder.CreateMul(Offset, ConstMul);
+    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
+      NewInsts.push_back(NewInst);
+  }
+  if (ConstOffset.isZero())
+    return Offset;
+
+  Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
+    NewInsts.push_back(NewInst);
+  return IndexAdd;
 }
 
 /// Promotes a single user of the alloca to a vector form.
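
GEPToVectorIndex previously bailed out on any GEP whose single variable offset was not exactly one element with no constant part; it now scales the variable offset by its stride, folds in the constant offset, and records any mul/add it emits in NewInsts so they can be erased if the alloca is later rejected. A hedged illustration of the index it now produces (the alloca type and value names are illustrative, not taken from this commit's tests):

; For a private [4 x [4 x i32]] alloca flattened to <16 x i32>, a GEP such as
;   %gep = getelementptr inbounds [4 x [4 x i32]], ptr addrspace(5) %alloca, i32 0, i32 %i, i32 2
; decomposes into a variable byte offset of 16 * %i plus a constant 8 bytes, so
; the computed vector index is equivalent to:
;   %mul = mul i32 %i, 4    ; stride of 4 i32 elements per inner array
;   %idx = add i32 2, %mul  ; plus the constant element offset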
@@ -737,23 +782,44 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
-    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
-        ArrayTy->getNumElements() > 0)
-      VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
-                                      ArrayTy->getNumElements());
+    uint64_t NumElems = 1;
+    Type *ElemTy;
+    do {
+      NumElems *= ArrayTy->getNumElements();
+      ElemTy = ArrayTy->getElementType();
+    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
+
+    // Check for array of vectors
+    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
+    if (InnerVectorTy) {
+      NumElems *= InnerVectorTy->getNumElements();
+      ElemTy = InnerVectorTy->getElementType();
+    }
+
+    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
+      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
+      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+      // Expand vector if required to match padding of inner type,
+      // i.e. odd size subvectors.
+      // Storage size of new vector must match that of alloca for correct
+      // behaviour of byte offsets and GEP computation.
+      if (NumElems * ElementSize != AllocaSize)
+        NumElems = AllocaSize / ElementSize;
+      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+    }
   }
 
-  // FIXME: There is no reason why we can't support larger arrays, we
-  // are just being conservative for now.
-  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
-  // equivalent. Potentially these could also be promoted but we don't currently
-  // handle this case
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
     return false;
   }
 
-  if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
+  const unsigned MaxElements =
+      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
+
+  if (VectorTy->getNumElements() > MaxElements ||
+      VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << "  " << *VectorTy
                       << " has an unsupported number of elements\n");
     return false;

@@ -763,11 +829,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
+  SmallVector<Instruction *> NewGEPInsts;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
 
   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
     LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                       << "    " << *Inst << "\n");
+    for (auto *Inst : reverse(NewGEPInsts))
+      Inst->eraseFromParent();
     return false;
   };
 
@@ -817,7 +886,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");
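
With the flattening loop above, nested arrays (and arrays of vectors) collapse into a single flat vector type, and the element-count cap is derived from MaxVectorRegs instead of the old hard-coded 16. A hedged sketch of a kernel that becomes promotable under the defaults; the function is illustrative and not one of this commit's tests:

; Hedged sketch: a [4 x [4 x float]] private array flattens to <16 x float>,
; which fits the default budget of 16 32-bit registers.
define amdgpu_kernel void @flatten_example(ptr addrspace(1) %out, i32 %i) {
entry:
  %alloca = alloca [4 x [4 x float]], align 4, addrspace(5)
  %gep = getelementptr inbounds [4 x [4 x float]], ptr addrspace(5) %alloca, i32 0, i32 %i, i32 3
  store float 1.0, ptr addrspace(5) %gep
  %v = load float, ptr addrspace(5) %gep
  store float %v, ptr addrspace(1) %out
  ret void
}
; After promotion, the load and store through %gep become extractelement and
; insertelement on a <16 x float> value at index 4 * %i + 3.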

llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll (7 additions, 2 deletions)

@@ -258,7 +258,7 @@ entry:
 ; FUNC-LABEL: {{^}}no_overlap:
 ;
 ; A total of 5 bytes should be allocated and used.
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
+; SI-ALLOCA: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 ;
 define amdgpu_kernel void @no_overlap(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1, addrspace(5)

@@ -281,6 +281,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}char_array_array:
 define amdgpu_kernel void @char_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]], addrspace(5)

@@ -294,6 +295,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}i32_array_array:
 define amdgpu_kernel void @i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]], addrspace(5)

@@ -306,6 +308,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}i64_array_array:
 define amdgpu_kernel void @i64_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]], addrspace(5)

@@ -319,7 +322,7 @@ entry:
 }
 
 %struct.pair32 = type { i32, i32 }
-
+; FUNC-LABEL: {{^}}struct_array_array:
 define amdgpu_kernel void @struct_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)

@@ -333,6 +336,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}struct_pair32_array:
 define amdgpu_kernel void @struct_pair32_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32], addrspace(5)

@@ -346,6 +350,7 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}select_private:
 define amdgpu_kernel void @select_private(ptr addrspace(1) %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32], addrspace(5)
llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll (1 addition, 7 deletions)

@@ -22,13 +22,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 ; SI-ALLOCA: s_barrier
 ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64
 ;
-; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
-; alloca to a vector. It currently fails because it does not know how
-; to interpret:
-; getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 1, i32 %b
-
-; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
-; SI-PROMOTE: ds_write_b32 [[PTRREG]]
+; SI-PROMOTE: LDSByteSize: 0
 define amdgpu_kernel void @test_private_array_ptr_calc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %inA, ptr addrspace(1) noalias %inB) #0 {
   %alloca = alloca [16 x i32], align 16, addrspace(5)
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
