20
20
#include " VPlanPatternMatch.h"
21
21
#include " VPlanUtils.h"
22
22
#include " VPlanVerifier.h"
23
+ #include " llvm/ADT/APInt.h"
23
24
#include " llvm/ADT/PostOrderIterator.h"
24
25
#include " llvm/ADT/STLExtras.h"
25
26
#include " llvm/ADT/SetVector.h"
29
30
#include " llvm/Analysis/VectorUtils.h"
30
31
#include " llvm/IR/Intrinsics.h"
31
32
#include " llvm/IR/PatternMatch.h"
33
+ #include " llvm/Support/Casting.h"
34
+ #include " llvm/Support/TypeSize.h"
32
35
33
36
using namespace llvm ;
34
37
@@ -1086,11 +1089,84 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
1086
1089
}
1087
1090
}
1088
1091
1089
- void VPlanTransforms::optimizeForVFAndUF (VPlan &Plan, ElementCount BestVF,
1090
- unsigned BestUF,
1091
- PredicatedScalarEvolution &PSE) {
1092
- assert (Plan.hasVF (BestVF) && " BestVF is not available in Plan" );
1093
- assert (Plan.hasUF (BestUF) && " BestUF is not available in Plan" );
1092
+ /// Optimize the width of vector induction variables in \p Plan based on a known
1093
+ /// constant Trip Count, \p BestVF and \p BestUF.
/// Returns false without modifying \p Plan when there is no vector loop region,
/// the trip count is not a live-in ConstantInt, or \p BestVF is not fixed.
/// Otherwise returns true iff at least one induction recipe was narrowed.
1094
+ static bool optimizeVectorInductionWidthForTCAndVFUF (VPlan &Plan,
1095
+ ElementCount BestVF,
1096
+ unsigned BestUF) {
1097
+ // Only proceed if we have not completely removed the vector region.
1098
+ if (!Plan.getVectorLoopRegion ())
1099
+ return false ;
1100
+
1101
// The new width is derived from the trip count's value, so the trip count
// must be a live-in whose underlying value is a ConstantInt, and BestVF must
// be fixed so that getKnownMinValue () below is used as the VF.
+ if (!Plan.getTripCount ()->isLiveIn ())
1102
+ return false ;
1103
+ auto *TC = dyn_cast_if_present<ConstantInt>(
1104
+ Plan.getTripCount ()->getUnderlyingValue ());
1105
+ if (!TC || !BestVF.isFixed ())
1106
+ return false ;
1107
+
1108
+ // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1109
+ // and UF. Returns at least 8.
// Align is passed as VF * UF: TC is rounded up to a multiple of Align, then 1
// is subtracted and the number of active bits of that value is rounded up to
// a power of two.
// NOTE(review): MaxVal is presumably the largest value the IV can take;
// confirm that the Align * ... multiplication cannot wrap for large TC.
1110
+ auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1111
+ APInt AlignedTC =
1112
+ Align * APIntOps::RoundingUDiv (TC, APInt (TC.getBitWidth (), Align),
1113
+ APInt::Rounding::UP);
1114
+ APInt MaxVal = AlignedTC - 1 ;
1115
+ return std::max<unsigned >(PowerOf2Ceil (MaxVal.getActiveBits ()), 8 );
1116
+ };
1117
+ unsigned NewBitWidth =
1118
+ ComputeBitWidth (TC->getValue (), BestVF.getKnownMinValue () * BestUF);
1119
+
1120
// Build the candidate integer type in the same LLVMContext as the canonical
// IV's scalar type.
+ LLVMContext &Ctx = Plan.getCanonicalIV ()->getScalarType ()->getContext ();
1121
+ auto *NewIVTy = IntegerType::get (Ctx, NewBitWidth);
1122
+
1123
+ bool MadeChange = false ;
1124
+
1125
// Visit every header phi; anything that is not a widened int/fp induction
// recipe is skipped by the null check below.
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion ()->getEntryBasicBlock ();
1126
+ for (VPRecipeBase &Phi : HeaderVPBB->phis ()) {
1127
+ auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1128
+
1129
+ // Currently only handle canonical IVs as it is trivial to replace the start
1130
+ // and stop values, and we currently only perform the optimization when the
1131
+ // IV has a single use.
// IVs whose scalar type already equals NewIVTy are skipped as well.
1132
+ if (!WideIV || !WideIV->isCanonical () ||
1133
+ WideIV->hasMoreThanOneUniqueUser () ||
1134
+ NewIVTy == WideIV->getScalarType ())
1135
+ continue ;
1136
+
1137
+ // Currently only handle cases where the single user is a header-mask
1138
+ // comparison with the backedge-taken-count.
// NOTE(review): user_begin () is dereferenced without an emptiness check;
// this assumes the IV has at least one user at this point - confirm.
1139
+ using namespace VPlanPatternMatch ;
1140
+ if (!match (
1141
+ *WideIV->user_begin (),
1142
+ m_Binary<Instruction::ICmp>(
1143
+ m_Specific (WideIV),
1144
+ m_Broadcast (m_Specific (Plan.getOrCreateBackedgeTakenCount ())))))
1145
+ continue ;
1146
+
1147
+ // Update IV operands and comparison bound to use new narrower type.
// The start becomes the constant 0 and the step the constant 1, both as
// live-ins of type NewIVTy.
1148
+ auto *NewStart = Plan.getOrAddLiveIn (ConstantInt::get (NewIVTy, 0 ));
1149
+ WideIV->setStartValue (NewStart);
1150
+ auto *NewStep = Plan.getOrAddLiveIn (ConstantInt::get (NewIVTy, 1 ));
1151
+ WideIV->setStepValue (NewStep);
1152
+
1153
// Truncate the backedge-taken count to NewIVTy with a cast recipe appended
// to the vector preheader, and make it operand 1 of the matched compare so
// both compare operands use the new type.
+ auto *NewBTC = new VPWidenCastRecipe (
1154
+ Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount (), NewIVTy);
1155
+ Plan.getVectorPreheader ()->appendRecipe (NewBTC);
1156
+ auto *Cmp = cast<VPInstruction>(*WideIV->user_begin ());
1157
+ Cmp->setOperand (1 , NewBTC);
1158
+
1159
+ MadeChange = true ;
1160
+ }
1161
+
1162
+ return MadeChange;
1163
+ }
1164
+
1165
+ /// Try to simplify the branch condition of \p Plan. This may restrict the
1166
+ /// resulting plan to \p BestVF and \p BestUF.
1167
+ static bool simplifyBranchConditionForVFAndUF (VPlan &Plan, ElementCount BestVF,
1168
+ unsigned BestUF,
1169
+ PredicatedScalarEvolution &PSE) {
1094
1170
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion ();
1095
1171
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock ();
1096
1172
auto *Term = &ExitingVPBB->back ();
@@ -1103,7 +1179,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1103
1179
if (!match (Term, m_BranchOnCount (m_VPValue (), m_VPValue ())) &&
1104
1180
!match (Term,
1105
1181
m_BranchOnCond (m_Not (m_ActiveLaneMask (m_VPValue (), m_VPValue ())))))
1106
- return ;
1182
+ return false ;
1107
1183
1108
1184
ScalarEvolution &SE = *PSE.getSE ();
1109
1185
const SCEV *TripCount =
@@ -1114,7 +1190,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1114
1190
const SCEV *C = SE.getElementCount (TripCount->getType (), NumElements);
1115
1191
if (TripCount->isZero () ||
1116
1192
!SE.isKnownPredicate (CmpInst::ICMP_ULE, TripCount, C))
1117
- return ;
1193
+ return false ;
1118
1194
1119
1195
// The vector loop region only executes once. If possible, completely remove
1120
1196
// the region, otherwise replace the terminator controlling the latch with
@@ -1140,7 +1216,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1140
1216
1141
1217
VPBlockUtils::connectBlocks (Preheader, Header);
1142
1218
VPBlockUtils::connectBlocks (ExitingVPBB, Exit);
1143
- simplifyRecipes (Plan, *CanIVTy);
1219
+ VPlanTransforms:: simplifyRecipes (Plan, *CanIVTy);
1144
1220
} else {
1145
1221
// The vector region contains header phis for which we cannot remove the
1146
1222
// loop region yet.
@@ -1153,8 +1229,23 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1153
1229
1154
1230
Term->eraseFromParent ();
1155
1231
1156
- Plan.setVF (BestVF);
1157
- assert (Plan.getUF () == BestUF && " BestUF must match the Plan's UF" );
1232
+ return true ;
1233
+ }
1234
+
1235
+ void VPlanTransforms::optimizeForVFAndUF (VPlan &Plan, ElementCount BestVF,
1236
+ unsigned BestUF,
1237
+ PredicatedScalarEvolution &PSE) {
1238
+ assert (Plan.hasVF (BestVF) && " BestVF is not available in Plan" );
1239
+ assert (Plan.hasUF (BestUF) && " BestUF is not available in Plan" );
1240
+
1241
+ bool MadeChange =
1242
+ simplifyBranchConditionForVFAndUF (Plan, BestVF, BestUF, PSE);
1243
+ MadeChange |= optimizeVectorInductionWidthForTCAndVFUF (Plan, BestVF, BestUF);
1244
+
1245
+ if (MadeChange) {
1246
+ Plan.setVF (BestVF);
1247
+ assert (Plan.getUF () == BestUF && " BestUF must match the Plan's UF" );
1248
+ }
1158
1249
// TODO: Further simplifications are possible
1159
1250
// 1. Replace inductions with constants.
1160
1251
// 2. Replace vector loop region with VPBasicBlock.
0 commit comments