Skip to content

Commit bf5627c

Browse files
authored
[LV] Optimize VPWidenIntOrFpInductionRecipe for known TC (#118828)
Optimize the IR generated for a VPWidenIntOrFpInductionRecipe to use the narrowest type necessary, when the trip-count of a loop is known to be constant and the only use of the recipe is the condition used by the vector loop's backedge branch.
1 parent db04c3e commit bf5627c

15 files changed

+776
-124
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1757,6 +1757,9 @@ class VPWidenInductionRecipe : public VPHeaderPHIRecipe {
17571757
VPValue *getStepValue() { return getOperand(1); }
17581758
const VPValue *getStepValue() const { return getOperand(1); }
17591759

1760+
/// Update the step value of the recipe.
1761+
void setStepValue(VPValue *V) { setOperand(1, V); }
1762+
17601763
PHINode *getPHINode() const { return cast<PHINode>(getUnderlyingValue()); }
17611764

17621765
/// Returns the induction descriptor for the recipe.

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,12 @@ m_BranchOnCond(const Op0_t &Op0) {
263263
return m_VPInstruction<VPInstruction::BranchOnCond>(Op0);
264264
}
265265

266+
template <typename Op0_t>
267+
inline UnaryVPInstruction_match<Op0_t, VPInstruction::Broadcast>
268+
m_Broadcast(const Op0_t &Op0) {
269+
return m_VPInstruction<VPInstruction::Broadcast>(Op0);
270+
}
271+
266272
template <typename Op0_t, typename Op1_t>
267273
inline BinaryVPInstruction_match<Op0_t, Op1_t, VPInstruction::ActiveLaneMask>
268274
m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 101 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "VPlanPatternMatch.h"
2121
#include "VPlanUtils.h"
2222
#include "VPlanVerifier.h"
23+
#include "llvm/ADT/APInt.h"
2324
#include "llvm/ADT/PostOrderIterator.h"
2425
#include "llvm/ADT/STLExtras.h"
2526
#include "llvm/ADT/SetVector.h"
@@ -29,6 +30,8 @@
2930
#include "llvm/Analysis/VectorUtils.h"
3031
#include "llvm/IR/Intrinsics.h"
3132
#include "llvm/IR/PatternMatch.h"
33+
#include "llvm/Support/Casting.h"
34+
#include "llvm/Support/TypeSize.h"
3235

3336
using namespace llvm;
3437

@@ -1086,11 +1089,84 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
10861089
}
10871090
}
10881091

1089-
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1090-
unsigned BestUF,
1091-
PredicatedScalarEvolution &PSE) {
1092-
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1093-
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1092+
/// Optimize the width of vector induction variables in \p Plan based on a known
1093+
/// constant Trip Count, \p BestVF and \p BestUF.
1094+
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
1095+
ElementCount BestVF,
1096+
unsigned BestUF) {
1097+
// Only proceed if we have not completely removed the vector region.
1098+
if (!Plan.getVectorLoopRegion())
1099+
return false;
1100+
1101+
if (!Plan.getTripCount()->isLiveIn())
1102+
return false;
1103+
auto *TC = dyn_cast_if_present<ConstantInt>(
1104+
Plan.getTripCount()->getUnderlyingValue());
1105+
if (!TC || !BestVF.isFixed())
1106+
return false;
1107+
1108+
// Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1109+
// and UF. Returns at least 8.
1110+
auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1111+
APInt AlignedTC =
1112+
Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align),
1113+
APInt::Rounding::UP);
1114+
APInt MaxVal = AlignedTC - 1;
1115+
return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1116+
};
1117+
unsigned NewBitWidth =
1118+
ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF);
1119+
1120+
LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext();
1121+
auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1122+
1123+
bool MadeChange = false;
1124+
1125+
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1126+
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1127+
auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1128+
1129+
// Currently only handle canonical IVs as it is trivial to replace the start
1130+
// and stop values, and we currently only perform the optimization when the
1131+
// IV has a single use.
1132+
if (!WideIV || !WideIV->isCanonical() ||
1133+
WideIV->hasMoreThanOneUniqueUser() ||
1134+
NewIVTy == WideIV->getScalarType())
1135+
continue;
1136+
1137+
// Currently only handle cases where the single user is a header-mask
1138+
// comparison with the backedge-taken-count.
1139+
using namespace VPlanPatternMatch;
1140+
if (!match(
1141+
*WideIV->user_begin(),
1142+
m_Binary<Instruction::ICmp>(
1143+
m_Specific(WideIV),
1144+
m_Broadcast(m_Specific(Plan.getOrCreateBackedgeTakenCount())))))
1145+
continue;
1146+
1147+
// Update IV operands and comparison bound to use new narrower type.
1148+
auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
1149+
WideIV->setStartValue(NewStart);
1150+
auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
1151+
WideIV->setStepValue(NewStep);
1152+
1153+
auto *NewBTC = new VPWidenCastRecipe(
1154+
Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1155+
Plan.getVectorPreheader()->appendRecipe(NewBTC);
1156+
auto *Cmp = cast<VPInstruction>(*WideIV->user_begin());
1157+
Cmp->setOperand(1, NewBTC);
1158+
1159+
MadeChange = true;
1160+
}
1161+
1162+
return MadeChange;
1163+
}
1164+
1165+
/// Try to simplify the branch condition of \p Plan. This may restrict the
1166+
/// resulting plan to \p BestVF and \p BestUF.
1167+
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
1168+
unsigned BestUF,
1169+
PredicatedScalarEvolution &PSE) {
10941170
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
10951171
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
10961172
auto *Term = &ExitingVPBB->back();
@@ -1103,7 +1179,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
11031179
if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) &&
11041180
!match(Term,
11051181
m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue())))))
1106-
return;
1182+
return false;
11071183

11081184
ScalarEvolution &SE = *PSE.getSE();
11091185
const SCEV *TripCount =
@@ -1114,7 +1190,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
11141190
const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements);
11151191
if (TripCount->isZero() ||
11161192
!SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C))
1117-
return;
1193+
return false;
11181194

11191195
// The vector loop region only executes once. If possible, completely remove
11201196
// the region, otherwise replace the terminator controlling the latch with
@@ -1140,7 +1216,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
11401216

11411217
VPBlockUtils::connectBlocks(Preheader, Header);
11421218
VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
1143-
simplifyRecipes(Plan, *CanIVTy);
1219+
VPlanTransforms::simplifyRecipes(Plan, *CanIVTy);
11441220
} else {
11451221
// The vector region contains header phis for which we cannot remove the
11461222
// loop region yet.
@@ -1153,8 +1229,23 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
11531229

11541230
Term->eraseFromParent();
11551231

1156-
Plan.setVF(BestVF);
1157-
assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1232+
return true;
1233+
}
1234+
1235+
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
1236+
unsigned BestUF,
1237+
PredicatedScalarEvolution &PSE) {
1238+
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
1239+
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
1240+
1241+
bool MadeChange =
1242+
simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
1243+
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
1244+
1245+
if (MadeChange) {
1246+
Plan.setVF(BestVF);
1247+
assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
1248+
}
11581249
// TODO: Further simplifications are possible
11591250
// 1. Replace inductions with constants.
11601251
// 2. Replace vector loop region with VPBasicBlock.

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -386,8 +386,8 @@ define void @latch_branch_cost(ptr %dst) {
386386
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
387387
; PRED: [[VECTOR_BODY]]:
388388
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
389-
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
390-
; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 99)
389+
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
390+
; PRED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 99)
391391
; PRED-NEXT: [[TMP1:%.*]] = extractelement <8 x i1> [[TMP0]], i32 0
392392
; PRED-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
393393
; PRED: [[PRED_STORE_IF]]:
@@ -453,7 +453,7 @@ define void @latch_branch_cost(ptr %dst) {
453453
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE14]]
454454
; PRED: [[PRED_STORE_CONTINUE14]]:
455455
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
456-
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
456+
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
457457
; PRED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 104
458458
; PRED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
459459
; PRED: [[MIDDLE_BLOCK]]:
@@ -790,9 +790,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
790790
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
791791
; DEFAULT: [[VECTOR_BODY]]:
792792
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
793-
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
793+
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
794794
; DEFAULT-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
795-
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
795+
; DEFAULT-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
796796
; DEFAULT-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
797797
; DEFAULT-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
798798
; DEFAULT: [[PRED_STORE_IF]]:
@@ -865,7 +865,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
865865
; DEFAULT-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
866866
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE14]]
867867
; DEFAULT: [[PRED_STORE_CONTINUE14]]:
868-
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
868+
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
869869
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
870870
; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
871871
; DEFAULT: [[MIDDLE_BLOCK]]:
@@ -892,9 +892,9 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
892892
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
893893
; PRED: [[VECTOR_BODY]]:
894894
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ]
895-
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
895+
; PRED-NEXT: [[VEC_IND:%.*]] = phi <8 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE14]] ]
896896
; PRED-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i8
897-
; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IND]], splat (i64 6)
897+
; PRED-NEXT: [[TMP1:%.*]] = icmp ule <8 x i8> [[VEC_IND]], splat (i8 6)
898898
; PRED-NEXT: [[TMP2:%.*]] = extractelement <8 x i1> [[TMP1]], i32 0
899899
; PRED-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
900900
; PRED: [[PRED_STORE_IF]]:
@@ -967,7 +967,7 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
967967
; PRED-NEXT: store i8 [[TMP33]], ptr [[TMP32]], align 1
968968
; PRED-NEXT: br label %[[PRED_STORE_CONTINUE14]]
969969
; PRED: [[PRED_STORE_CONTINUE14]]:
970-
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
970+
; PRED-NEXT: [[VEC_IND_NEXT]] = add <8 x i8> [[VEC_IND]], splat (i8 8)
971971
; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
972972
; PRED-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
973973
; PRED: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
244244
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
245245
; DEFAULT: [[VECTOR_BODY]]:
246246
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ]
247-
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ]
247+
; DEFAULT-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ]
248248
; DEFAULT-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ]
249-
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> [[VEC_IND]], splat (i64 14)
249+
; DEFAULT-NEXT: [[TMP0:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14)
250250
; DEFAULT-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
251251
; DEFAULT-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
252252
; DEFAULT-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]]
@@ -398,7 +398,7 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
398398
; DEFAULT-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
399399
; DEFAULT-NEXT: br label %[[PRED_STORE_CONTINUE36]]
400400
; DEFAULT: [[PRED_STORE_CONTINUE36]]:
401-
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
401+
; DEFAULT-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
402402
; DEFAULT-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
403403
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
404404
; DEFAULT-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -439,9 +439,9 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
439439
; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]]
440440
; OPTSIZE: [[VECTOR_BODY]]:
441441
; OPTSIZE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE36:.*]] ]
442-
; OPTSIZE-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ]
442+
; OPTSIZE-NEXT: [[VEC_IND:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE36]] ]
443443
; OPTSIZE-NEXT: [[VEC_IND1:%.*]] = phi <16 x i8> [ <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], %[[PRED_STORE_CONTINUE36]] ]
444-
; OPTSIZE-NEXT: [[TMP72:%.*]] = icmp ule <16 x i64> [[VEC_IND]], splat (i64 14)
444+
; OPTSIZE-NEXT: [[TMP72:%.*]] = icmp ule <16 x i8> [[VEC_IND]], splat (i8 14)
445445
; OPTSIZE-NEXT: [[TMP1:%.*]] = mul <16 x i8> [[BROADCAST_SPLAT]], [[VEC_IND1]]
446446
; OPTSIZE-NEXT: [[TMP2:%.*]] = lshr <16 x i8> [[VEC_IND1]], splat (i8 1)
447447
; OPTSIZE-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP2]], [[BROADCAST_SPLAT4]]
@@ -593,7 +593,7 @@ define void @tail_predicate_without_optsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 %n)
593593
; OPTSIZE-NEXT: store i8 [[TMP71]], ptr [[TMP70]], align 1
594594
; OPTSIZE-NEXT: br label %[[PRED_STORE_CONTINUE36]]
595595
; OPTSIZE: [[PRED_STORE_CONTINUE36]]:
596-
; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16)
596+
; OPTSIZE-NEXT: [[VEC_IND_NEXT]] = add <16 x i8> [[VEC_IND]], splat (i8 16)
597597
; OPTSIZE-NEXT: [[VEC_IND_NEXT2]] = add <16 x i8> [[VEC_IND1]], splat (i8 16)
598598
; OPTSIZE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
599599
; OPTSIZE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]

llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
5151
; VF4-NEXT: br label %[[VECTOR_BODY:.*]]
5252
; VF4: [[VECTOR_BODY]]:
5353
; VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ]
54-
; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
55-
; VF4-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], splat (i64 1)
54+
; VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ]
55+
; VF4-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 1)
5656
; VF4-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
5757
; VF4-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
5858
; VF4: [[PRED_STORE_IF]]:
@@ -109,7 +109,7 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) {
109109
; VF4-NEXT: store i64 [[TMP32]], ptr [[TMP31]], align 8
110110
; VF4-NEXT: br label %[[PRED_STORE_CONTINUE6]]
111111
; VF4: [[PRED_STORE_CONTINUE6]]:
112-
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
112+
; VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
113113
; VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
114114
; VF4-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
115115
; VF4: [[MIDDLE_BLOCK]]:

0 commit comments

Comments (0)