@@ -66,6 +66,19 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit(
     cl::desc("Maximum byte size to consider promote alloca to vector"),
     cl::init(0));
 
+static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
+    "amdgpu-promote-alloca-to-vector-max-regs",
+    cl::desc(
+        "Maximum vector size (in 32b registers) to use when promoting alloca"),
+    cl::init(16));
+
+// Use up to 1/4 of available register budget for vectorization.
+// FIXME: Increase the limit for whole function budgets? Perhaps x2?
+static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
+    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
+    cl::init(4));
+
 static cl::opt<unsigned>
     LoopUserWeight("promote-alloca-vector-loop-user-weight",
                    cl::desc("The bonus weight of users of allocas within loop "
@@ -84,6 +97,8 @@ class AMDGPUPromoteAllocaImpl {
   uint32_t LocalMemLimit = 0;
   uint32_t CurrentLocalMemUsage = 0;
   unsigned MaxVGPRs;
+  unsigned VGPRBudgetRatio;
+  unsigned MaxVectorRegs;
 
   bool IsAMDGCN = false;
   bool IsAMDHSA = false;
@@ -112,6 +127,8 @@ class AMDGPUPromoteAllocaImpl {
 
   void sortAllocasToPromote(SmallVectorImpl<AllocaInst *> &Allocas);
 
+  void setFunctionLimits(const Function &F);
+
 public:
   AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
 
@@ -298,6 +315,19 @@ void AMDGPUPromoteAllocaImpl::sortAllocasToPromote(
   // clang-format on
 }
 
+void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
+  // Load per function limits, overriding with global options where appropriate.
+  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-max-regs", PromoteAllocaToVectorMaxRegs);
+  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
+    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
+  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
+      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
+      PromoteAllocaToVectorVGPRRatio);
+  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
+    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
+}
+
 bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
   Mod = F.getParent();
   DL = &Mod->getDataLayout();
@@ -307,15 +337,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
     return false;
 
   MaxVGPRs = getMaxVGPRs(TM, F);
+  setFunctionLimits(F);
 
   bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
 
-  // Use up to 1/4 of available register budget for vectorization.
-  // FIXME: Increase the limit for whole function budgets? Perhaps x2?
   unsigned VectorizationBudget =
       (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                   : (MaxVGPRs * 32)) /
-      4;
+      VGPRBudgetRatio;
 
   SmallVector<AllocaInst *, 16> Allocas;
   for (Instruction &I : F.getEntryBlock()) {
@@ -400,7 +429,8 @@ static Value *calculateVectorIndex(
 }
 
 static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
-                               Type *VecElemTy, const DataLayout &DL) {
+                               Type *VecElemTy, const DataLayout &DL,
+                               SmallVector<Instruction *> &NewInsts) {
   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
   // helper.
   unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
@@ -414,22 +444,37 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (VarOffsets.size() > 1)
     return nullptr;
 
-  if (VarOffsets.size() == 1) {
-    // Only handle cases where we don't need to insert extra arithmetic
-    // instructions.
-    const auto &VarOffset = VarOffsets.front();
-    if (!ConstOffset.isZero() || VarOffset.second != VecElemSize)
-      return nullptr;
-    return VarOffset.first;
-  }
-
   APInt Quot;
   uint64_t Rem;
   APInt::udivrem(ConstOffset, VecElemSize, Quot, Rem);
   if (Rem != 0)
     return nullptr;
 
-  return ConstantInt::get(GEP->getContext(), Quot);
+  ConstantInt *ConstIndex = ConstantInt::get(GEP->getContext(), Quot);
+  if (VarOffsets.size() == 0)
+    return ConstIndex;
+
+  IRBuilder<> Builder(GEP);
+
+  const auto &VarOffset = VarOffsets.front();
+  APInt::udivrem(VarOffset.second, VecElemSize, Quot, Rem);
+  if (Rem != 0 || Quot.isZero())
+    return nullptr;
+
+  Value *Offset = VarOffset.first;
+  if (!Quot.isOne()) {
+    ConstantInt *ConstMul = ConstantInt::get(GEP->getContext(), Quot);
+    Offset = Builder.CreateMul(Offset, ConstMul);
+    if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
+      NewInsts.push_back(NewInst);
+  }
+  if (ConstOffset.isZero())
+    return Offset;
+
+  Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+  if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
+    NewInsts.push_back(NewInst);
+  return IndexAdd;
 }
 
 /// Promotes a single user of the alloca to a vector form.
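
Note on the hunk above: the rewritten GEPToVectorIndex reduces a GEP whose byte offset has the form ConstOffset + VarOffset * Scale to a vector element index, emitting a mul and/or add only when needed and recording those new instructions in NewInsts so they can be rolled back if promotion is later rejected. Below is a minimal standalone sketch of just the underlying arithmetic; the helper name and types are the editor's, not part of the patch.

#include <cstdint>
#include <optional>

// Element index for a byte offset ConstOffset + VarOffset * Scale into a
// vector of EltSize-byte elements. Mirrors the divisibility checks above:
// both the constant offset and the variable stride must be exact multiples
// of EltSize, and the scaled stride must not collapse to zero.
struct IndexExpr {
  uint64_t ConstIndex; // ConstOffset / EltSize
  uint64_t VarScale;   // Scale / EltSize, multiplier for the variable part
};

std::optional<IndexExpr> byteOffsetToElementIndex(uint64_t ConstOffset,
                                                  uint64_t Scale,
                                                  uint64_t EltSize) {
  if (ConstOffset % EltSize != 0 || Scale % EltSize != 0 ||
      Scale / EltSize == 0)
    return std::nullopt; // no exact element index exists; reject the GEP
  return IndexExpr{ConstOffset / EltSize, Scale / EltSize};
}
// E.g. ConstOffset = 8, Scale = 12, EltSize = 4 gives index 2 + 3 * VarOffset,
// which the pass materializes as an add of a mul.
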
@@ -737,23 +782,44 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
-    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
-        ArrayTy->getNumElements() > 0)
-      VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
-                                      ArrayTy->getNumElements());
+    uint64_t NumElems = 1;
+    Type *ElemTy;
+    do {
+      NumElems *= ArrayTy->getNumElements();
+      ElemTy = ArrayTy->getElementType();
+    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
+
+    // Check for array of vectors
+    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
+    if (InnerVectorTy) {
+      NumElems *= InnerVectorTy->getNumElements();
+      ElemTy = InnerVectorTy->getElementType();
+    }
+
+    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
+      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
+      unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
+      // Expand vector if required to match padding of inner type,
+      // i.e. odd size subvectors.
+      // Storage size of new vector must match that of alloca for correct
+      // behaviour of byte offsets and GEP computation.
+      if (NumElems * ElementSize != AllocaSize)
+        NumElems = AllocaSize / ElementSize;
+      if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
+        VectorTy = FixedVectorType::get(ElemTy, NumElems);
+    }
   }
 
-  // FIXME: There is no reason why we can't support larger arrays, we
-  // are just being conservative for now.
-  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
-  // equivalent. Potentially these could also be promoted but we don't currently
-  // handle this case
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
     return false;
   }
 
-  if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
+  const unsigned MaxElements =
+      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
+
+  if (VectorTy->getNumElements() > MaxElements ||
+      VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << "  " << *VectorTy
                       << " has an unsupported number of elements\n");
     return false;
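
Worked example for the hunk above (illustrative; the type and datalayout assumptions are the editor's, not from the patch): an alloca of [4 x <3 x half>] flattens to 12 half elements, but on a datalayout that pads <3 x half> to 8 bytes the alloca's store size is 32 bytes, so the element count is widened to 16 and the alloca is promoted as <16 x half>, keeping byte offsets and GEP arithmetic consistent with the original in-memory layout. A small sketch of that fix-up:

#include <cstdint>

// Widen a flattened element count so NumElems * EltSizeBytes matches the
// alloca's store size, as the padding fix-up above does; names here are
// illustrative. The caller still requires NumElems > 0 and that the store
// size is an exact multiple of the element size.
uint64_t widenForPadding(uint64_t NumElems, uint64_t EltSizeBytes,
                         uint64_t AllocaStoreSizeBytes) {
  if (NumElems * EltSizeBytes != AllocaStoreSizeBytes)
    NumElems = AllocaStoreSizeBytes / EltSizeBytes;
  return NumElems;
}
// E.g. widenForPadding(12, 2, 32) == 16 for the [4 x <3 x half>] case above.
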
@@ -763,11 +829,14 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
   SmallVector<Instruction *> DeferredInsts;
+  SmallVector<Instruction *> NewGEPInsts;
   DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
 
   const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
     LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                       << "    " << *Inst << "\n");
+    for (auto *Inst : reverse(NewGEPInsts))
+      Inst->eraseFromParent();
     return false;
   };
 
@@ -817,7 +886,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
     if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
       // If we can't compute a vector index from this GEP, then we can't
       // promote this alloca to vector.
-      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
+      Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts);
       if (!Index)
         return RejectUser(Inst, "cannot compute vector index for GEP");
 
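
The per-function limits consumed by setFunctionLimits are ordinary string function attributes, so a frontend or test can attach them directly, while an explicitly given command-line option still wins. A short sketch of setting them through the C++ API follows; the attribute names come from the patch, and the module and function setup is assumed boilerplate.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("example", Ctx);
  Function *F = Function::Create(
      FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false),
      Function::ExternalLinkage, "kernel", M);

  // Allow promoted vectors of up to 32 VGPRs for this function and let
  // promotion spend up to half of the VGPR budget (ratio 2).
  F->addFnAttr("amdgpu-promote-alloca-to-vector-max-regs", "32");
  F->addFnAttr("amdgpu-promote-alloca-to-vector-vgpr-ratio", "2");
  return 0;
}

The same overrides exist as the -amdgpu-promote-alloca-to-vector-max-regs and -amdgpu-promote-alloca-to-vector-vgpr-ratio options, which take precedence over the attributes when specified explicitly.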