[VPlan] Update scalar induction resume values in VPlan.#110577
[VPlan] Update scalar induction resume values in VPlan.#110577
Conversation
|
@llvm/pr-subscribers-backend-systemz @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesUpdated ILV.crateInductionResumeValues to directly update the This is the first step towards modeling them completely in VPlan. Builds on top of #109975, which Patch is 351.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110577.diff 56 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 034765bee40e7b..674c2176f2a504 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
ElementCount MinProfitableTripCount,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
- PSI(PSI), RTChecks(RTChecks) {
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
/// and the resume values can come from an additional bypass block, the \p
/// AdditionalBypass pair provides information about the bypass block and the
/// end value on the edge from bypass to this loop.
- PHINode *createInductionResumeValue(
+ void createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ std::pair<BasicBlock *, Value *>
+ getInductionBypassValue(PHINode *OrigPhi) const {
+ return InductionBypassValues.find(OrigPhi)->second;
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -680,6 +686,11 @@ class InnerLoopVectorizer {
/// Structure to hold information about generated runtime checks, responsible
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+
+ /// Mapping of induction phis to their bypass values and bypass blocks.
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
+
+ VPlan &Plan;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -721,10 +732,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
- CM, BFI, PSI, Checks),
+ CM, BFI, PSI, Checks, Plan),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -761,9 +772,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Check)
+ GeneratedRTChecks &Check, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Check) {}
+ EPI, LVL, CM, BFI, PSI, Check, Plan) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
std::pair<BasicBlock *, Value *>
@@ -790,9 +801,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Checks) {
+ EPI, LVL, CM, BFI, PSI, Checks, Plan) {
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
@@ -2555,7 +2566,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
nullptr, Twine(Prefix) + "scalar.ph");
}
-PHINode *InnerLoopVectorizer::createInductionResumeValue(
+static void addOperandToPhiInVPIRBasicBlock(VPIRBasicBlock *VPBB, PHINode *P,
+ VPValue *Op) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *IRI = cast<VPIRInstruction>(&R);
+ if (&IRI->getInstruction() == P) {
+ IRI->addOperand(Op);
+ break;
+ }
+ }
+}
+
+void InnerLoopVectorizer::createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2590,27 +2612,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
}
}
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getFirstNonPHIIt());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+ VPBasicBlock *ScalarPHVPBB = nullptr;
+ if (MiddleVPBB->getNumSuccessors() == 2) {
+ // Order is strict: first is the exit block, second is the scalar preheader.
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+ } else {
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
+ }
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : BypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi,
+ {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
+ OrigPhi->getDebugLoc(), "bc.resume.val");
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
- return BCResumeVal;
+ auto *ScalarLoopHeader =
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
+ addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
+ EndValueFromAdditionalBypass};
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2643,10 +2666,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
for (const auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
- PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
- AdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ createInductionResumeValue(OrigPhi, II, getExpandedStep(II, ExpandedSCEVs),
+ LoopBypassBlocks, AdditionalBypass);
}
}
@@ -2931,10 +2952,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
IVEndValues[Entry.first], LoopMiddleBlock, Plan, State);
}
- // Fix live-out phis not already fixed earlier.
- for (const auto &KV : Plan.getLiveOuts())
- KV.second->fixPhi(Plan, State);
-
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
@@ -7682,6 +7699,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// the second pass for the scalar loop. The induction resume values for the
// inductions in the epilogue loop are created before executing the plan for
// the epilogue loop.
+ for (VPRecipeBase &R :
+ Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ // Create induction resume values for both widened pointer and
+ // integer/fp inductions and update the start value of the induction
+ // recipes to use the resume value.
+ PHINode *IndPhi = nullptr;
+ const InductionDescriptor *ID;
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
+ ID = &Ind->getInductionDescriptor();
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+ IndPhi = WidenInd->getPHINode();
+ ID = &WidenInd->getInductionDescriptor();
+ } else
+ continue;
+
+ createInductionResumeValue(IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+ LoopBypassBlocks);
+ }
return {LoopVectorPreHeader, nullptr};
}
@@ -8852,7 +8888,9 @@ static void addLiveOutsForFirstOrderRecurrences(
VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
"scalar.recur.init");
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
- Plan.addLiveOut(FORPhi, ResumePhiRecipe);
+ addOperandToPhiInVPIRBasicBlock(
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor()), FORPhi,
+ ResumePhiRecipe);
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9579,7 +9617,7 @@ static bool processLoopInVPlanNativePath(
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
- VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
+ VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10067,11 +10105,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
InnerLoopVectorizer Unroller(
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks);
+ ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10093,10 +10131,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, &LVL, &CM, BFI, PSI, Checks);
+ EPI, &LVL, &CM, BFI, PSI, Checks,
+ *BestMainPlan);
- std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
@@ -10105,11 +10144,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
+ VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
- Checks);
+ Checks, BestEpiPlan);
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
@@ -10158,23 +10197,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
RdxDesc.getRecurrenceStartValue());
}
} else {
- // Create induction resume values for both widened pointer and
- // integer/fp inductions and update the start value of the induction
- // recipes to use the resume value.
+ // Retrive the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop
PHINode *IndPhi = nullptr;
- const InductionDescriptor *ID;
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
- ID = &Ind->getInductionDescriptor();
} else {
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
IndPhi = WidenInd->getPHINode();
- ID = &WidenInd->getInductionDescriptor();
}
-
- ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
- {EPI.MainLoopIterationCountCheck});
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
}
assert(ResumeV && "Must have a resume value");
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
@@ -10186,13 +10218,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
+ BasicBlock *PH = L->getLoopPreheader();
+ for (const auto &[IVPhi, _] : LVL.getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue(IVPhi);
+ Inc->setIncomingValueForBlock(BB, V);
+ }
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
- PSI, Checks);
+ PSI, Checks, BestPlan);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4247d20cb0e530..64e5cde112d44e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -456,10 +456,17 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
executeRecipes(State, getIRBasicBlock());
if (getSingleSuccessor()) {
- assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
- auto *Br = State->Builder.CreateBr(getIRBasicBlock());
- Br->setOperand(0, nullptr);
- getIRBasicBlock()->getTerminator()->eraseFromParent();
+ auto *SuccVPIRBB = dyn_cast<VPIRBasicBlock>(getSingleSuccessor());
+ if (SuccVPIRBB && SuccVPIRBB->getIRBasicBlock() ==
+ getIRBasicBlock()->getSingleSuccessor()) {
+ cast<BranchInst>(getIRBasicBlock()->getTerminator())
+ ->setOperand(0, nullptr);
+ } else {
+ assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
+ auto *Br = State->Builder.CreateBr(getIRBasicBlock());
+ Br->setOperand(0, nullptr);
+ getIRBasicBlock()->getTerminator()->eraseFromParent();
+ }
}
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -843,10 +850,6 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
#endif
VPlan::~VPlan() {
- for (auto &KV : LiveOuts)
- delete KV.second;
- LiveOuts.clear();
-
if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -902,6 +905,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBasicBlock *ScalarHeader =
+ VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader());
+ VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
@@ -1051,6 +1057,8 @@ void VPlan::execute(VPTransformState *State) {
BrInst->insertBefore(MiddleBB->getTerminator());
MiddleBB->getTerminator()->eraseFromParent();
State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
// Generate code in the loop pre-header and body.
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -1169,12 +1177,6 @@ void VPlan::print(raw_ostream &O) const {
Block->print(O, "", SlotTracker);
}
- if (!LiveOuts.empty())
- O << "\n";
- for (const auto &KV : LiveOuts) {
- KV.second->print(O, SlotTracker);
- }
-
O << "}\n";
}
@@ -1211,11 +1213,6 @@ LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
#endif
-void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
- assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
- LiveOuts.insert({PN, new VPLiveOut(PN, V)});
-}
-
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
// Update the operands of all cloned recipes starting at NewEntry. This
@@ -1283,10 +1280,6 @@ VPlan *VPlan::duplicate() {
remapOperands(Preheader, NewPreheader, Old2NewVPValues);
remapOperands(Entry, NewEntry, Old2NewVPValues);
- // Clone live-outs.
- for (const auto &[_, LO] : LiveOuts)
- NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
-
// Initialize remaining fields of cloned VPlan.
NewPlan->VFs = VFs;
NewPlan->UFs = UFs;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8392aec8ad396e..41339eb701b2cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -655,48 +655,6 @@ class VPBlockBase {
virtual VPBlockBase *clone() = 0;
};
-/// A value that is used outside the VPlan. The operand of the user needs to be
-/// added to the associated phi node. The incoming block from VPlan is
-/// determined by where the VPValue is defined: if it is defined by a recipe
-/// outside a region, its parent block is used, otherwise the middle block is
-/// used.
-class VPLiveOut : public VPUser {
- PHINode *Phi;
-
-public:
- VPLiveOut(PHINode *Phi, V...
[truncated]
|
|
@llvm/pr-subscribers-backend-powerpc Author: Florian Hahn (fhahn) ChangesUpdated ILV.crateInductionResumeValues to directly update the This is the first step towards modeling them completely in VPlan. Builds on top of #109975, which Patch is 351.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110577.diff 56 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 034765bee40e7b..674c2176f2a504 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
ElementCount MinProfitableTripCount,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
- PSI(PSI), RTChecks(RTChecks) {
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
/// and the resume values can come from an additional bypass block, the \p
/// AdditionalBypass pair provides information about the bypass block and the
/// end value on the edge from bypass to this loop.
- PHINode *createInductionResumeValue(
+ void createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ std::pair<BasicBlock *, Value *>
+ getInductionBypassValue(PHINode *OrigPhi) const {
+ return InductionBypassValues.find(OrigPhi)->second;
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -680,6 +686,11 @@ class InnerLoopVectorizer {
/// Structure to hold information about generated runtime checks, responsible
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+
+ /// Mapping of induction phis to their bypass values and bypass blocks.
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
+
+ VPlan &Plan;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -721,10 +732,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
- CM, BFI, PSI, Checks),
+ CM, BFI, PSI, Checks, Plan),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -761,9 +772,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Check)
+ GeneratedRTChecks &Check, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Check) {}
+ EPI, LVL, CM, BFI, PSI, Check, Plan) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
std::pair<BasicBlock *, Value *>
@@ -790,9 +801,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Checks) {
+ EPI, LVL, CM, BFI, PSI, Checks, Plan) {
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
@@ -2555,7 +2566,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
nullptr, Twine(Prefix) + "scalar.ph");
}
-PHINode *InnerLoopVectorizer::createInductionResumeValue(
+static void addOperandToPhiInVPIRBasicBlock(VPIRBasicBlock *VPBB, PHINode *P,
+ VPValue *Op) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *IRI = cast<VPIRInstruction>(&R);
+ if (&IRI->getInstruction() == P) {
+ IRI->addOperand(Op);
+ break;
+ }
+ }
+}
+
+void InnerLoopVectorizer::createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2590,27 +2612,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
}
}
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getFirstNonPHIIt());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+ VPBasicBlock *ScalarPHVPBB = nullptr;
+ if (MiddleVPBB->getNumSuccessors() == 2) {
+ // Order is strict: first is the exit block, second is the scalar preheader.
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+ } else {
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
+ }
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : BypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi,
+ {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
+ OrigPhi->getDebugLoc(), "bc.resume.val");
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
- return BCResumeVal;
+ auto *ScalarLoopHeader =
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
+ addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
+ EndValueFromAdditionalBypass};
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2643,10 +2666,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
for (const auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
- PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
- AdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ createInductionResumeValue(OrigPhi, II, getExpandedStep(II, ExpandedSCEVs),
+ LoopBypassBlocks, AdditionalBypass);
}
}
@@ -2931,10 +2952,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
IVEndValues[Entry.first], LoopMiddleBlock, Plan, State);
}
- // Fix live-out phis not already fixed earlier.
- for (const auto &KV : Plan.getLiveOuts())
- KV.second->fixPhi(Plan, State);
-
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
@@ -7682,6 +7699,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// the second pass for the scalar loop. The induction resume values for the
// inductions in the epilogue loop are created before executing the plan for
// the epilogue loop.
+ for (VPRecipeBase &R :
+ Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ // Create induction resume values for both widened pointer and
+ // integer/fp inductions and update the start value of the induction
+ // recipes to use the resume value.
+ PHINode *IndPhi = nullptr;
+ const InductionDescriptor *ID;
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
+ ID = &Ind->getInductionDescriptor();
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+ IndPhi = WidenInd->getPHINode();
+ ID = &WidenInd->getInductionDescriptor();
+ } else
+ continue;
+
+ createInductionResumeValue(IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+ LoopBypassBlocks);
+ }
return {LoopVectorPreHeader, nullptr};
}
@@ -8852,7 +8888,9 @@ static void addLiveOutsForFirstOrderRecurrences(
VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
"scalar.recur.init");
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
- Plan.addLiveOut(FORPhi, ResumePhiRecipe);
+ addOperandToPhiInVPIRBasicBlock(
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor()), FORPhi,
+ ResumePhiRecipe);
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9579,7 +9617,7 @@ static bool processLoopInVPlanNativePath(
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
- VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
+ VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10067,11 +10105,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
InnerLoopVectorizer Unroller(
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks);
+ ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10093,10 +10131,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, &LVL, &CM, BFI, PSI, Checks);
+ EPI, &LVL, &CM, BFI, PSI, Checks,
+ *BestMainPlan);
- std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
@@ -10105,11 +10144,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
+ VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
- Checks);
+ Checks, BestEpiPlan);
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
@@ -10158,23 +10197,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
RdxDesc.getRecurrenceStartValue());
}
} else {
- // Create induction resume values for both widened pointer and
- // integer/fp inductions and update the start value of the induction
- // recipes to use the resume value.
+ // Retrive the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop
PHINode *IndPhi = nullptr;
- const InductionDescriptor *ID;
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
- ID = &Ind->getInductionDescriptor();
} else {
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
IndPhi = WidenInd->getPHINode();
- ID = &WidenInd->getInductionDescriptor();
}
-
- ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
- {EPI.MainLoopIterationCountCheck});
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
}
assert(ResumeV && "Must have a resume value");
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
@@ -10186,13 +10218,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
+ BasicBlock *PH = L->getLoopPreheader();
+ for (const auto &[IVPhi, _] : LVL.getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue(IVPhi);
+ Inc->setIncomingValueForBlock(BB, V);
+ }
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
- PSI, Checks);
+ PSI, Checks, BestPlan);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4247d20cb0e530..64e5cde112d44e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -456,10 +456,17 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
executeRecipes(State, getIRBasicBlock());
if (getSingleSuccessor()) {
- assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
- auto *Br = State->Builder.CreateBr(getIRBasicBlock());
- Br->setOperand(0, nullptr);
- getIRBasicBlock()->getTerminator()->eraseFromParent();
+ auto *SuccVPIRBB = dyn_cast<VPIRBasicBlock>(getSingleSuccessor());
+ if (SuccVPIRBB && SuccVPIRBB->getIRBasicBlock() ==
+ getIRBasicBlock()->getSingleSuccessor()) {
+ cast<BranchInst>(getIRBasicBlock()->getTerminator())
+ ->setOperand(0, nullptr);
+ } else {
+ assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
+ auto *Br = State->Builder.CreateBr(getIRBasicBlock());
+ Br->setOperand(0, nullptr);
+ getIRBasicBlock()->getTerminator()->eraseFromParent();
+ }
}
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -843,10 +850,6 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
#endif
VPlan::~VPlan() {
- for (auto &KV : LiveOuts)
- delete KV.second;
- LiveOuts.clear();
-
if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -902,6 +905,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBasicBlock *ScalarHeader =
+ VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader());
+ VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
@@ -1051,6 +1057,8 @@ void VPlan::execute(VPTransformState *State) {
BrInst->insertBefore(MiddleBB->getTerminator());
MiddleBB->getTerminator()->eraseFromParent();
State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
// Generate code in the loop pre-header and body.
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -1169,12 +1177,6 @@ void VPlan::print(raw_ostream &O) const {
Block->print(O, "", SlotTracker);
}
- if (!LiveOuts.empty())
- O << "\n";
- for (const auto &KV : LiveOuts) {
- KV.second->print(O, SlotTracker);
- }
-
O << "}\n";
}
@@ -1211,11 +1213,6 @@ LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
#endif
-void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
- assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
- LiveOuts.insert({PN, new VPLiveOut(PN, V)});
-}
-
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
// Update the operands of all cloned recipes starting at NewEntry. This
@@ -1283,10 +1280,6 @@ VPlan *VPlan::duplicate() {
remapOperands(Preheader, NewPreheader, Old2NewVPValues);
remapOperands(Entry, NewEntry, Old2NewVPValues);
- // Clone live-outs.
- for (const auto &[_, LO] : LiveOuts)
- NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
-
// Initialize remaining fields of cloned VPlan.
NewPlan->VFs = VFs;
NewPlan->UFs = UFs;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8392aec8ad396e..41339eb701b2cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -655,48 +655,6 @@ class VPBlockBase {
virtual VPBlockBase *clone() = 0;
};
-/// A value that is used outside the VPlan. The operand of the user needs to be
-/// added to the associated phi node. The incoming block from VPlan is
-/// determined by where the VPValue is defined: if it is defined by a recipe
-/// outside a region, its parent block is used, otherwise the middle block is
-/// used.
-class VPLiveOut : public VPUser {
- PHINode *Phi;
-
-public:
- VPLiveOut(PHINode *Phi, V...
[truncated]
|
Use SCEV to check if the minimum iteration check (TC < Step) is known to be false. This is a first step towards addressing #111098. To catch the exact case from the issue, we need to do extra work to make sure the wrap flags on the shl are preserved and used by SCEV. Note that skeleton creation will be gradually moved to VPlan and this simplification should be done as VPlan transform eventually. The current plan is to move skeleton creation to VPlan starting from parts closest to the parts already created by VPlan, starting with induction resume value creation (started with #110577), then memory and SCEV checks and finally minimum iteration checks. PR: #111310
Updated ILV.crateInductionResumeValues to directly update the VPIRInstructiosn wrapping the original phis with the created resume values. This is the first step towards modeling them completely in VPlan. Subsequent patches will move creation of the resume values completely into VPlan. Builds on top of llvm#109975, which is included in this PR.
aac0b49 to
33b3aac
Compare
|
Rebased after landing #109975, ping :) |
dc06bd9 to
c98b6d3
Compare
|
ping :) |
|
|
||
| std::pair<BasicBlock *, Value *> | ||
| getInductionBypassValue(PHINode *OrigPhi) const { | ||
| return InductionBypassValues.find(OrigPhi)->second; |
There was a problem hiding this comment.
| return InductionBypassValues.find(OrigPhi)->second; | |
| return InductionBypassValues.at(OrigPhi); |
| } else | ||
| continue; |
There was a problem hiding this comment.
| } else | |
| continue; | |
| } else { | |
| continue; | |
| } |
| PHINode *createInductionResumeValue( | ||
| /// Create a ResumePHI VPInstruction for the induction variable \p OrigPhi to | ||
| /// resume iteration count in the scalar epilogue, from where the vectorized | ||
| /// loop left off and add it the scalar preheader of the VPlan. \p Step is the |
There was a problem hiding this comment.
| /// loop left off and add it the scalar preheader of the VPlan. \p Step is the | |
| /// loop left off, and add it to the scalar preheader of VPlan. \p Step is the |
| /// provides information about the bypass block and the end value on the edge | ||
| /// from bypass to this loop. |
There was a problem hiding this comment.
| /// provides information about the bypass block and the end value on the edge | |
| /// from bypass to this loop. | |
| /// provides this additional bypass block along with the resume value coming | |
| /// from it. |
| /// end value on the edge from bypass to this loop. | ||
| PHINode *createInductionResumeValue( | ||
| /// Create a ResumePHI VPInstruction for the induction variable \p OrigPhi to | ||
| /// resume iteration count in the scalar epilogue, from where the vectorized |
There was a problem hiding this comment.
| /// resume iteration count in the scalar epilogue, from where the vectorized | |
| /// resume iteration count in the scalar epilogue from where the vectorized |
| for (VPRecipeBase &R : *ScalarLoopHeader) { | ||
| auto *IRI = cast<VPIRInstruction>(&R); | ||
| if (&IRI->getInstruction() == OrigPhi) { | ||
| IRI->addOperand(ResumePhiRecipe); |
There was a problem hiding this comment.
Worth asserting IRI has no operands before adding one?
| {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())}, | ||
| OrigPhi->getDebugLoc(), "bc.resume.val"); | ||
| auto *ScalarLoopHeader = Plan.getScalarHeader(); | ||
| for (VPRecipeBase &R : *ScalarLoopHeader) { |
There was a problem hiding this comment.
Instead of searching linearly for each induction phi in scalar loop header, have createInductionResumeValues() scan the IRI VPIRInstructions of scalar loop header which wrap inductions, passing IRI to createInductionResumeValue(), from which OrigPhi can be retrieved? More below.
| BasicBlock *PH = L->getLoopPreheader(); | ||
|
|
There was a problem hiding this comment.
| BasicBlock *PH = L->getLoopPreheader(); | |
| // SOME COMMENT | |
| BasicBlock *PH = L->getLoopPreheader(); |
There was a problem hiding this comment.
Added comment at new location, thanks!
| for (const auto &[IVPhi, _] : LVL.getInductionVars()) { | ||
| auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); | ||
| const auto &[BB, V] = EpilogILV.getInductionBypassValue(IVPhi); | ||
| Inc->setIncomingValueForBlock(BB, V); | ||
| } |
There was a problem hiding this comment.
Could this be taken care of by LVP.executePlan() above rather than here in LoopVectorizePass::processLoop()?
| for (VPRecipeBase &R : | ||
| Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
There was a problem hiding this comment.
Could we iterate over the IRI's of the scalar loop header instead, as suggested above?
There was a problem hiding this comment.
Unfortunately I don't think we can here, as we don't have a mapping from IR values to VPValues (other than live-outs here)
There was a problem hiding this comment.
Iterating over the IRI's of scalar header instead of the integer/fp induction header phi recipes of vector header, relieves the need for such a mapping - to find PhiR which wraps IndPhi (by searching thru the recipes of scalar header for each IndPhi), as in plural createInductionResumeValues()?
There was a problem hiding this comment.
The code here originally tried to just create resume value for wide phis, and the scalar VPIRInstructions don't have a link to the induction phi recipes. But I replaced this now as per the suggestion above.
| ; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 | ||
| ; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] | ||
| ; PRED: pred.store.if3: | ||
| ; PRED: pred.store.if2: |
There was a problem hiding this comment.
Is there a way to remove or reduce the amount of such irrelevant naming changes?
There was a problem hiding this comment.
Trying to avoid additional redundant changes.
There was a problem hiding this comment.
Unfortunately I don't think so, as the numbering depends on the exact order in which names are created that need de-duplication (there's a single counter for all names)
There was a problem hiding this comment.
OK. Just curious - are there now fewer(?) de-duplications taking place, or only a reordering of de-duplications?
There was a problem hiding this comment.
Trying to confirm this patch is effectively NFCI.
There was a problem hiding this comment.
Tried to strop as many unrelated changes from the diffs, but also added some new ones in the ones with predicated blocks to update the block variables.
| /// Mapping of induction phis to their bypass values and bypass blocks. | ||
| DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues; | ||
|
|
There was a problem hiding this comment.
This is needed only for "additional" bypasses. Is there a way to avoid storing this mapping?
There was a problem hiding this comment.
At the moment I think this is the only place it is stored, so unfortunately there's no other way to retrieve it unless storing it here (or somewhere else in ILV)
| VPIRInstruction *PhiR, PHINode *OrigPhi, const InductionDescriptor &II, | ||
| Value *Step, ArrayRef<BasicBlock *> BypassBlocks, | ||
| VPBuilder &ScalarPHBuilder, | ||
| std::pair<BasicBlock *, Value *> AdditionalBypass) { |
There was a problem hiding this comment.
| VPIRInstruction *PhiR, PHINode *OrigPhi, const InductionDescriptor &II, | |
| Value *Step, ArrayRef<BasicBlock *> BypassBlocks, | |
| VPBuilder &ScalarPHBuilder, | |
| std::pair<BasicBlock *, Value *> AdditionalBypass) { | |
| VPIRInstruction *PhiR, const InductionDescriptor &II, | |
| Value *Step, ArrayRef<BasicBlock *> BypassBlocks, | |
| VPBuilder &ScalarPHBuilder, | |
| std::pair<BasicBlock *, Value *> AdditionalBypass) { | |
| auto *OrigPhi = cast<PHINode>(&PhiR->getInstruction()); |
?
There was a problem hiding this comment.
Thanks, the PHINode argument has been removed in the latest version
| // Store the bypass values here, as they need to be added to their phi nodes | ||
| // after the epilogue skeleton has been created. |
There was a problem hiding this comment.
The value for "non-additional" bypasses also need to be added after creating epilog skeleton.
This is due to ResumePhi holding only two operands (one value coming from the vectorized loop and the other value bypassing it), whereas when vectorizing the epilog loop it should have three?
Perhaps allow ResumePhi to hold a third operand, sorted "upwards", according to https://llvm.org/docs/Vectorizers.html#epilogue-vectorization?
There was a problem hiding this comment.
Sounds good, but maybe better done as follow up?
There was a problem hiding this comment.
Very well, perhaps worth leaving a TODO.
| // If we come from a bypass edge then we need to start from the original | ||
| // start value. | ||
| VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader(); | ||
| VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin()); |
There was a problem hiding this comment.
In the long-term roadmap, it may be better to update eveny ResumePhi of the scalar preheader when each bypass block is introduced, to keep this introduction atomic, keeping control-flow predecessors up-to-date with data-flow phi's.
| // Create induction resume values and ResumePhis for the inductions in the | ||
| // epilogue loop in the VPlan for the epilogue vector loop. |
There was a problem hiding this comment.
Sentence is somewhat cumbersome.
Below EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() continues to call plural createInductionResumeValues(), for resume values of the scalar loop, with additional bypass.
Here EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() now calls singular createInductionResumeValue(), repeatedly, for resume values of the epilog loop, w/o additional bypass.
Can plural createInductionResumeValues() be called here as well, w/o additional bypass? It seems to be doing exactly what's needed.
There was a problem hiding this comment.
It can, at the cost of creating more bypass values than needed; in particular we only need resume values for widened inductions, other inductions resume from the canonical one which will get created separately.
I updated it for now, but there are some tests which need to be updated with the redundant phis (which previously got removed). Thinking about it now, we might have to create them all here, as an induction being widened in the main loop may not be widened in the epilogue.
There was a problem hiding this comment.
Are there still more bypass values created than needed, leading to test changes?
| for (VPRecipeBase &R : | ||
| Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
There was a problem hiding this comment.
Iterating over the IRI's of scalar header instead of the integer/fp induction header phi recipes of vector header, relieves the need for such a mapping - to find PhiR which wraps IndPhi (by searching thru the recipes of scalar header for each IndPhi), as in plural createInductionResumeValues()?
| PHINode *InnerLoopVectorizer::createInductionResumeValue( | ||
| PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, | ||
| ArrayRef<BasicBlock *> BypassBlocks, | ||
| void InnerLoopVectorizer::createInductionResumeValue( |
There was a problem hiding this comment.
Worth renaming createInductionResumeVPValue()?
This used to create only IR Values, in particular a PHINode in scalar preheader. Now it creates a ResumePhi recipe in scalar preheader instead, while still generating IR in vector preheader/bypass.
| ResumeV = MainILV.createInductionResumeValue( | ||
| IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), | ||
| {EPI.MainLoopIterationCountCheck}); | ||
| ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader()); |
There was a problem hiding this comment.
Is there an existing VPValue to resume from, is it not already used as the start value of header phi induction recipes of epilog loop?
There was a problem hiding this comment.
I might be missing something, but the code here sets the start value for the header recipes in the epilogue. We could possibly get it from the ResumePhis in the main vector loop VPlan, but the VPValues would be defined in a different plan?
There was a problem hiding this comment.
Ah, right. Perhaps worth a note explaining the (missing) connection to ResumePhi recipes in main loop VPlan, which were executed and hooked up to the Phi nodes of the scalar loop.
There was a problem hiding this comment.
Something like // Hook up to the PHINode generated by a ResumePhi recipe of main loop VPlan, which feeds the scalar loop?
| ; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 | ||
| ; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] | ||
| ; PRED: pred.store.if3: | ||
| ; PRED: pred.store.if2: |
There was a problem hiding this comment.
Trying to avoid additional redundant changes.
33308d2 to
ce214f5
Compare
(responding here as the comment button disappeared...) In some cases we create an additional resume phi, but most renames are due to the resume phis now being created after generating code for the main vector loop instead of creating them upfront (and thus bumping the name counter earlier before this patch) |
ayalz
left a comment
There was a problem hiding this comment.
"AdditionalBypassValue" is inaccurate - it serves as the additional bypass value only for the primary induction, other inductions use it to build their additional bypass value. Suggest to call it "MainVectorTripCount".
| return Induction2AdditionalBypassValue.at(OrigPhi); | ||
| } | ||
|
|
||
| /// Return the additional bypass block. |
There was a problem hiding this comment.
| /// Return the additional bypass block. | |
| /// Return the additional bypass block which targets the scalar loop by skipping the epilog loop after completing the main loop. |
| PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, | ||
| ArrayRef<BasicBlock *> BypassBlocks, | ||
| std::pair<BasicBlock *, Value *> AdditionalBypass) { | ||
| void InnerLoopVectorizer::createInductionResumeVPValue( |
There was a problem hiding this comment.
This method moves from generating (and returning) an IR PHINode to generating (and recording in VPlan) a ResumePhi recipe. It should therefore migrate from ILV to LVP (perhaps extending VP[Recipe]Builder). At the moment however it still generates IR Values in addition to said recipe, by calling getOrCreateVectorTripCount() and emitTransformedIndex(). These should migrate to use VPlan::getVectorTripCount() and perhaps some form of VPDerivedIVRecipe, respectively. Would be good to leave behind a TODO?
| void InnerLoopVectorizer::createInductionResumeVPValue( | ||
| VPIRInstruction *InductionPhiRI, const InductionDescriptor &II, Value *Step, | ||
| ArrayRef<BasicBlock *> BypassBlocks, VPBuilder &ScalarPHBuilder, | ||
| Value *AdditionalBypassValue) { |
There was a problem hiding this comment.
| Value *AdditionalBypassValue) { | |
| Value *MainVectorTripCount) { |
this serves as the additional bypass value for primary induction, and the basis for computing additional bypass value for other inductions.
| BasicBlock *PH = OrigLoop->getLoopPreheader(); | ||
| for (const auto &[IVPhi, _] : Legal->getInductionVars()) { | ||
| auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); | ||
| Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); |
| Value *EndValueFromAdditionalBypass = AdditionalBypassValue; | ||
| if (OrigPhi == OldInduction) { | ||
| // We know what the end value is. | ||
| EndValue = VectorTripCount; |
There was a problem hiding this comment.
| Value *EndValueFromAdditionalBypass = AdditionalBypassValue; | |
| if (OrigPhi == OldInduction) { | |
| // We know what the end value is. | |
| EndValue = VectorTripCount; | |
| Value *EndValueFromAdditionalBypass = MainVectorTripCount; | |
| // Otherwise compute them accordingly. | |
| if (OrigPhi != OldInduction) { |
| ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] | ||
| ; CHECK: pred.store.if2: | ||
| ; CHECK: pred.store.if1: | ||
| ; CHECK-NEXT: [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], 2008 | ||
| ; CHECK-NEXT: [[TMP11:%.*]] = add i16 [[TMP10]], 2008 | ||
| ; CHECK-NEXT: [[TMP12:%.*]] = udiv i16 4943, [[TMP11]] | ||
| ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP12]] | ||
| ; CHECK-NEXT: store i16 0, ptr [[TMP13]], align 2 | ||
| ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] | ||
| ; CHECK: pred.store.continue3: | ||
| ; CHECK: pred.store.continue2: |
| ; CHECK-NEXT: br label [[VECTOR_BODY33:%.*]] | ||
| ; CHECK: vector.body33: | ||
| ; CHECK: vector.body35: |
| ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY60:%.*]] | ||
| ; CHECK: vec.epilog.vector.body60: | ||
| ; CHECK: vec.epilog.vector.body58: |
| ; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT22]], <i64 0, i64 1, i64 2, i64 3> | ||
| ; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT18]] | ||
| ; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT18]], <i64 0, i64 1, i64 2, i64 3> | ||
| ; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]] |
There was a problem hiding this comment.
Yep, seems like too many changes unfortunately
| ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[NEXT_GEP10]], align 16 | ||
| ; CHECK-NEXT: store i32 [[TMP17]], ptr [[NEXT_GEP5]], align 16 | ||
| ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] | ||
| ; CHECK: pred.store.continue18: |
ayalz
left a comment
There was a problem hiding this comment.
This LGTM, thanks!
Adding final comments and suggestion.
| // the epilogue loop. | ||
| // Generate VPValues and ResumePhi recipes for inductions in the epilogue loop | ||
| // to resume from the main loop or bypass it, if there are any wide | ||
| // inductions. Otherwise it is we only need a resume value for the canonical |
There was a problem hiding this comment.
| // inductions. Otherwise it is we only need a resume value for the canonical | |
| // inductions. Otherwise we only need a resume value for the canonical |
| // vector epilogue preheader | ||
| // Generate a resume phi for the canonical induction of the vector epilogue | ||
| // and put it in the vector epilogue preheader, unless such a phi already | ||
| // exists there - and can be reused |
There was a problem hiding this comment.
| // exists there - and can be reused | |
| // exists there - and can be reused |
| // exists there - and can be reused | |
| // exists there - and can be reused. |
| // to be vectorized by executing the plan (potentially with a different | ||
| // factor) again shortly afterwards. | ||
| EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); | ||
| VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); |
There was a problem hiding this comment.
Perhaps worth commenting that BestEpiPlan is used only to avoid generating redundant instructions in the best main plan - if unused by BestEpiPlan.
There was a problem hiding this comment.
Updated by collecting a set of IVs that need resume values and passing it, thanks
| ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] | ||
| ; CHECK: pred.store.if2: | ||
| ; CHECK: pred.store.if1: | ||
| ; CHECK-NEXT: [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], 2008 | ||
| ; CHECK-NEXT: [[TMP11:%.*]] = add i16 [[TMP10]], 2008 | ||
| ; CHECK-NEXT: [[TMP12:%.*]] = udiv i16 4943, [[TMP11]] | ||
| ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP12]] | ||
| ; CHECK-NEXT: store i16 0, ptr [[TMP13]], align 2 | ||
| ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] | ||
| ; CHECK: pred.store.continue3: | ||
| ; CHECK: pred.store.continue2: |
| if (any_of( | ||
| EPI.EpiloguePlan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), | ||
| IsaPred<VPWidenIntOrFpInductionRecipe, | ||
| VPWidenPointerInductionRecipe>)) |
There was a problem hiding this comment.
Worth collecting all original induction phi nodes that get widen in the epilog, and pass that set to createInductionResumeValues(), rather than checking if the set is empty and gating this function?
Split off NFC part refactoring from #110577. This simplifies and clarifies induction resume value creation for bypass blocks.
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/169/builds/6111 Here is the relevant piece of the build log for the reference |
| if (!isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(&H)) | ||
| continue; | ||
| if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&H)) | ||
| WideIVs.insert(WideIV->getPHINode()); | ||
| else | ||
| WideIVs.insert(cast<PHINode>(H.getVPSingleValue()->getUnderlyingValue())); |
There was a problem hiding this comment.
Post-commit nit: early-continue seems redundant, suffice to do
if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&H))
WideIVs.insert(WideIV->getPHINode());
else if (isa<VPWidenPointerInductionRecipe>(&H))
WideIVs.insert(cast<PHINode>(H.getVPSingleValue()->getUnderlyingValue()));
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/164/builds/5274 Here is the relevant piece of the build log for the reference |
|
Reverted because of crashes in llvm-test-suite when using stage 2 clang (https://llvm-compile-time-tracker.com/show_error.php?commit=4f7f71b7bccdc38f37b82981e8fa9ceb536a7016). |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/94/builds/2683 Here is the relevant piece of the build log for the reference |
Ah unfortunate timing, I think the issue has been fixed by 1091fad (first commit that passes the build again looking at https://llvm-compile-time-tracker.com) |
Updated ILV.createInductionResumeValues to directly update the
VPIRInstructions wrapping the original phis with the created resume
values.
This is the first step towards modeling them completely in VPlan.
Subsequent patches will move creation of the resume values completely
into VPlan.
Depends on #109975.