-
Notifications
You must be signed in to change notification settings - Fork 12.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[VPlan] Update scalar induction resume values in VPlan. #110577
Conversation
@llvm/pr-subscribers-backend-systemz @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesUpdated ILV.crateInductionResumeValues to directly update the This is the first step towards modeling them completely in VPlan. Builds on top of #109975, which Patch is 351.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110577.diff 56 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 034765bee40e7b..674c2176f2a504 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
ElementCount MinProfitableTripCount,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
- PSI(PSI), RTChecks(RTChecks) {
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
/// and the resume values can come from an additional bypass block, the \p
/// AdditionalBypass pair provides information about the bypass block and the
/// end value on the edge from bypass to this loop.
- PHINode *createInductionResumeValue(
+ void createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ std::pair<BasicBlock *, Value *>
+ getInductionBypassValue(PHINode *OrigPhi) const {
+ return InductionBypassValues.find(OrigPhi)->second;
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -680,6 +686,11 @@ class InnerLoopVectorizer {
/// Structure to hold information about generated runtime checks, responsible
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+
+ /// Mapping of induction phis to their bypass values and bypass blocks.
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
+
+ VPlan &Plan;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -721,10 +732,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
- CM, BFI, PSI, Checks),
+ CM, BFI, PSI, Checks, Plan),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -761,9 +772,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Check)
+ GeneratedRTChecks &Check, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Check) {}
+ EPI, LVL, CM, BFI, PSI, Check, Plan) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
std::pair<BasicBlock *, Value *>
@@ -790,9 +801,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Checks) {
+ EPI, LVL, CM, BFI, PSI, Checks, Plan) {
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
@@ -2555,7 +2566,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
nullptr, Twine(Prefix) + "scalar.ph");
}
-PHINode *InnerLoopVectorizer::createInductionResumeValue(
+static void addOperandToPhiInVPIRBasicBlock(VPIRBasicBlock *VPBB, PHINode *P,
+ VPValue *Op) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *IRI = cast<VPIRInstruction>(&R);
+ if (&IRI->getInstruction() == P) {
+ IRI->addOperand(Op);
+ break;
+ }
+ }
+}
+
+void InnerLoopVectorizer::createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2590,27 +2612,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
}
}
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getFirstNonPHIIt());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+ VPBasicBlock *ScalarPHVPBB = nullptr;
+ if (MiddleVPBB->getNumSuccessors() == 2) {
+ // Order is strict: first is the exit block, second is the scalar preheader.
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+ } else {
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
+ }
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : BypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi,
+ {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
+ OrigPhi->getDebugLoc(), "bc.resume.val");
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
- return BCResumeVal;
+ auto *ScalarLoopHeader =
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
+ addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
+ EndValueFromAdditionalBypass};
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2643,10 +2666,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
for (const auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
- PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
- AdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ createInductionResumeValue(OrigPhi, II, getExpandedStep(II, ExpandedSCEVs),
+ LoopBypassBlocks, AdditionalBypass);
}
}
@@ -2931,10 +2952,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
IVEndValues[Entry.first], LoopMiddleBlock, Plan, State);
}
- // Fix live-out phis not already fixed earlier.
- for (const auto &KV : Plan.getLiveOuts())
- KV.second->fixPhi(Plan, State);
-
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
@@ -7682,6 +7699,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// the second pass for the scalar loop. The induction resume values for the
// inductions in the epilogue loop are created before executing the plan for
// the epilogue loop.
+ for (VPRecipeBase &R :
+ Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ // Create induction resume values for both widened pointer and
+ // integer/fp inductions and update the start value of the induction
+ // recipes to use the resume value.
+ PHINode *IndPhi = nullptr;
+ const InductionDescriptor *ID;
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
+ ID = &Ind->getInductionDescriptor();
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+ IndPhi = WidenInd->getPHINode();
+ ID = &WidenInd->getInductionDescriptor();
+ } else
+ continue;
+
+ createInductionResumeValue(IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+ LoopBypassBlocks);
+ }
return {LoopVectorPreHeader, nullptr};
}
@@ -8852,7 +8888,9 @@ static void addLiveOutsForFirstOrderRecurrences(
VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
"scalar.recur.init");
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
- Plan.addLiveOut(FORPhi, ResumePhiRecipe);
+ addOperandToPhiInVPIRBasicBlock(
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor()), FORPhi,
+ ResumePhiRecipe);
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9579,7 +9617,7 @@ static bool processLoopInVPlanNativePath(
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
- VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
+ VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10067,11 +10105,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
InnerLoopVectorizer Unroller(
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks);
+ ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10093,10 +10131,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, &LVL, &CM, BFI, PSI, Checks);
+ EPI, &LVL, &CM, BFI, PSI, Checks,
+ *BestMainPlan);
- std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
@@ -10105,11 +10144,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
+ VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
- Checks);
+ Checks, BestEpiPlan);
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
@@ -10158,23 +10197,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
RdxDesc.getRecurrenceStartValue());
}
} else {
- // Create induction resume values for both widened pointer and
- // integer/fp inductions and update the start value of the induction
- // recipes to use the resume value.
+ // Retrive the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop
PHINode *IndPhi = nullptr;
- const InductionDescriptor *ID;
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
- ID = &Ind->getInductionDescriptor();
} else {
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
IndPhi = WidenInd->getPHINode();
- ID = &WidenInd->getInductionDescriptor();
}
-
- ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
- {EPI.MainLoopIterationCountCheck});
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
}
assert(ResumeV && "Must have a resume value");
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
@@ -10186,13 +10218,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
+ BasicBlock *PH = L->getLoopPreheader();
+ for (const auto &[IVPhi, _] : LVL.getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue(IVPhi);
+ Inc->setIncomingValueForBlock(BB, V);
+ }
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
- PSI, Checks);
+ PSI, Checks, BestPlan);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4247d20cb0e530..64e5cde112d44e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -456,10 +456,17 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
executeRecipes(State, getIRBasicBlock());
if (getSingleSuccessor()) {
- assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
- auto *Br = State->Builder.CreateBr(getIRBasicBlock());
- Br->setOperand(0, nullptr);
- getIRBasicBlock()->getTerminator()->eraseFromParent();
+ auto *SuccVPIRBB = dyn_cast<VPIRBasicBlock>(getSingleSuccessor());
+ if (SuccVPIRBB && SuccVPIRBB->getIRBasicBlock() ==
+ getIRBasicBlock()->getSingleSuccessor()) {
+ cast<BranchInst>(getIRBasicBlock()->getTerminator())
+ ->setOperand(0, nullptr);
+ } else {
+ assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
+ auto *Br = State->Builder.CreateBr(getIRBasicBlock());
+ Br->setOperand(0, nullptr);
+ getIRBasicBlock()->getTerminator()->eraseFromParent();
+ }
}
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -843,10 +850,6 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
#endif
VPlan::~VPlan() {
- for (auto &KV : LiveOuts)
- delete KV.second;
- LiveOuts.clear();
-
if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -902,6 +905,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBasicBlock *ScalarHeader =
+ VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader());
+ VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
@@ -1051,6 +1057,8 @@ void VPlan::execute(VPTransformState *State) {
BrInst->insertBefore(MiddleBB->getTerminator());
MiddleBB->getTerminator()->eraseFromParent();
State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
// Generate code in the loop pre-header and body.
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -1169,12 +1177,6 @@ void VPlan::print(raw_ostream &O) const {
Block->print(O, "", SlotTracker);
}
- if (!LiveOuts.empty())
- O << "\n";
- for (const auto &KV : LiveOuts) {
- KV.second->print(O, SlotTracker);
- }
-
O << "}\n";
}
@@ -1211,11 +1213,6 @@ LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
#endif
-void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
- assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
- LiveOuts.insert({PN, new VPLiveOut(PN, V)});
-}
-
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
// Update the operands of all cloned recipes starting at NewEntry. This
@@ -1283,10 +1280,6 @@ VPlan *VPlan::duplicate() {
remapOperands(Preheader, NewPreheader, Old2NewVPValues);
remapOperands(Entry, NewEntry, Old2NewVPValues);
- // Clone live-outs.
- for (const auto &[_, LO] : LiveOuts)
- NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
-
// Initialize remaining fields of cloned VPlan.
NewPlan->VFs = VFs;
NewPlan->UFs = UFs;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8392aec8ad396e..41339eb701b2cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -655,48 +655,6 @@ class VPBlockBase {
virtual VPBlockBase *clone() = 0;
};
-/// A value that is used outside the VPlan. The operand of the user needs to be
-/// added to the associated phi node. The incoming block from VPlan is
-/// determined by where the VPValue is defined: if it is defined by a recipe
-/// outside a region, its parent block is used, otherwise the middle block is
-/// used.
-class VPLiveOut : public VPUser {
- PHINode *Phi;
-
-public:
- VPLiveOut(PHINode *Phi, V...
[truncated]
|
@llvm/pr-subscribers-backend-powerpc Author: Florian Hahn (fhahn) ChangesUpdated ILV.crateInductionResumeValues to directly update the This is the first step towards modeling them completely in VPlan. Builds on top of #109975, which Patch is 351.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/110577.diff 56 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 034765bee40e7b..674c2176f2a504 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
ElementCount MinProfitableTripCount,
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
+ VPlan &Plan)
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
- PSI(PSI), RTChecks(RTChecks) {
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
// Query this against the original loop and save it here because the profile
// of the original loop header may change as the transformation happens.
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
/// and the resume values can come from an additional bypass block, the \p
/// AdditionalBypass pair provides information about the bypass block and the
/// end value on the edge from bypass to this loop.
- PHINode *createInductionResumeValue(
+ void createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ std::pair<BasicBlock *, Value *>
+ getInductionBypassValue(PHINode *OrigPhi) const {
+ return InductionBypassValues.find(OrigPhi)->second;
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -680,6 +686,11 @@ class InnerLoopVectorizer {
/// Structure to hold information about generated runtime checks, responsible
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+
+ /// Mapping of induction phis to their bypass values and bypass blocks.
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
+
+ VPlan &Plan;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -721,10 +732,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
- CM, BFI, PSI, Checks),
+ CM, BFI, PSI, Checks, Plan),
EPI(EPI) {}
// Override this function to handle the more complex control flow around the
@@ -761,9 +772,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Check)
+ GeneratedRTChecks &Check, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Check) {}
+ EPI, LVL, CM, BFI, PSI, Check, Plan) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
std::pair<BasicBlock *, Value *>
@@ -790,9 +801,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- GeneratedRTChecks &Checks)
+ GeneratedRTChecks &Checks, VPlan &Plan)
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, LVL, CM, BFI, PSI, Checks) {
+ EPI, LVL, CM, BFI, PSI, Checks, Plan) {
TripCount = EPI.TripCount;
}
/// Implements the interface for creating a vectorized skeleton using the
@@ -2555,7 +2566,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
nullptr, Twine(Prefix) + "scalar.ph");
}
-PHINode *InnerLoopVectorizer::createInductionResumeValue(
+static void addOperandToPhiInVPIRBasicBlock(VPIRBasicBlock *VPBB, PHINode *P,
+ VPValue *Op) {
+ for (VPRecipeBase &R : *VPBB) {
+ auto *IRI = cast<VPIRInstruction>(&R);
+ if (&IRI->getInstruction() == P) {
+ IRI->addOperand(Op);
+ break;
+ }
+ }
+}
+
+void InnerLoopVectorizer::createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2590,27 +2612,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
}
}
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getFirstNonPHIIt());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ VPBasicBlock *MiddleVPBB =
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+ VPBasicBlock *ScalarPHVPBB = nullptr;
+ if (MiddleVPBB->getNumSuccessors() == 2) {
+ // Order is strict: first is the exit block, second is the scalar preheader.
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
+ } else {
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
+ }
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : BypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ VPBuilder ScalarPHBuilder(ScalarPHVPBB);
+ auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
+ VPInstruction::ResumePhi,
+ {Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
+ OrigPhi->getDebugLoc(), "bc.resume.val");
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
- EndValueFromAdditionalBypass);
- return BCResumeVal;
+ auto *ScalarLoopHeader =
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
+ addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
+ EndValueFromAdditionalBypass};
}
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2643,10 +2666,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
for (const auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
- PHINode *BCResumeVal = createInductionResumeValue(
- OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
- AdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ createInductionResumeValue(OrigPhi, II, getExpandedStep(II, ExpandedSCEVs),
+ LoopBypassBlocks, AdditionalBypass);
}
}
@@ -2931,10 +2952,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
IVEndValues[Entry.first], LoopMiddleBlock, Plan, State);
}
- // Fix live-out phis not already fixed earlier.
- for (const auto &KV : Plan.getLiveOuts())
- KV.second->fixPhi(Plan, State);
-
for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);
@@ -7682,6 +7699,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
// the second pass for the scalar loop. The induction resume values for the
// inductions in the epilogue loop are created before executing the plan for
// the epilogue loop.
+ for (VPRecipeBase &R :
+ Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
+ // Create induction resume values for both widened pointer and
+ // integer/fp inductions and update the start value of the induction
+ // recipes to use the resume value.
+ PHINode *IndPhi = nullptr;
+ const InductionDescriptor *ID;
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
+ ID = &Ind->getInductionDescriptor();
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+ IndPhi = WidenInd->getPHINode();
+ ID = &WidenInd->getInductionDescriptor();
+ } else
+ continue;
+
+ createInductionResumeValue(IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
+ LoopBypassBlocks);
+ }
return {LoopVectorPreHeader, nullptr};
}
@@ -8852,7 +8888,9 @@ static void addLiveOutsForFirstOrderRecurrences(
VPInstruction::ResumePhi, {Resume, FOR->getStartValue()}, {},
"scalar.recur.init");
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr());
- Plan.addLiveOut(FORPhi, ResumePhiRecipe);
+ addOperandToPhiInVPIRBasicBlock(
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor()), FORPhi,
+ ResumePhiRecipe);
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9579,7 +9617,7 @@ static bool processLoopInVPlanNativePath(
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
F->getDataLayout(), AddBranchWeights);
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
- VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
+ VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
@@ -10067,11 +10105,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
assert(IC > 1 && "interleave count should not be 1 or 0");
// If we decided that it is not legal to vectorize the loop, then
// interleave it.
+ VPlan &BestPlan = LVP.getPlanFor(VF.Width);
InnerLoopVectorizer Unroller(
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
- ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks);
+ ElementCount::getFixed(1), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan);
- VPlan &BestPlan = LVP.getPlanFor(VF.Width);
LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
@@ -10093,10 +10131,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
+ std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
- EPI, &LVL, &CM, BFI, PSI, Checks);
+ EPI, &LVL, &CM, BFI, PSI, Checks,
+ *BestMainPlan);
- std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
@@ -10105,11 +10144,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
+ VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
- Checks);
+ Checks, BestEpiPlan);
- VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
Header->setName("vec.epilog.vector.body");
@@ -10158,23 +10197,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
RdxDesc.getRecurrenceStartValue());
}
} else {
- // Create induction resume values for both widened pointer and
- // integer/fp inductions and update the start value of the induction
- // recipes to use the resume value.
+ // Retrive the induction resume values for wide inductions from
+ // their original phi nodes in the scalar loop
PHINode *IndPhi = nullptr;
- const InductionDescriptor *ID;
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
IndPhi = cast<PHINode>(Ind->getUnderlyingValue());
- ID = &Ind->getInductionDescriptor();
} else {
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
IndPhi = WidenInd->getPHINode();
- ID = &WidenInd->getInductionDescriptor();
}
-
- ResumeV = MainILV.createInductionResumeValue(
- IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
- {EPI.MainLoopIterationCountCheck});
+ ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader());
}
assert(ResumeV && "Must have a resume value");
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
@@ -10186,13 +10218,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT, true, &ExpandedSCEVs);
++LoopsEpilogueVectorized;
+ BasicBlock *PH = L->getLoopPreheader();
+ for (const auto &[IVPhi, _] : LVL.getInductionVars()) {
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue(IVPhi);
+ Inc->setIncomingValueForBlock(BB, V);
+ }
if (!MainILV.areSafetyChecksAdded())
DisableRuntimeUnroll = true;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
- PSI, Checks);
+ PSI, Checks, BestPlan);
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4247d20cb0e530..64e5cde112d44e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -456,10 +456,17 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
executeRecipes(State, getIRBasicBlock());
if (getSingleSuccessor()) {
- assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
- auto *Br = State->Builder.CreateBr(getIRBasicBlock());
- Br->setOperand(0, nullptr);
- getIRBasicBlock()->getTerminator()->eraseFromParent();
+ auto *SuccVPIRBB = dyn_cast<VPIRBasicBlock>(getSingleSuccessor());
+ if (SuccVPIRBB && SuccVPIRBB->getIRBasicBlock() ==
+ getIRBasicBlock()->getSingleSuccessor()) {
+ cast<BranchInst>(getIRBasicBlock()->getTerminator())
+ ->setOperand(0, nullptr);
+ } else {
+ assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
+ auto *Br = State->Builder.CreateBr(getIRBasicBlock());
+ Br->setOperand(0, nullptr);
+ getIRBasicBlock()->getTerminator()->eraseFromParent();
+ }
}
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
@@ -843,10 +850,6 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
#endif
VPlan::~VPlan() {
- for (auto &KV : LiveOuts)
- delete KV.second;
- LiveOuts.clear();
-
if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -902,6 +905,9 @@ VPlanPtr VPlan::createInitialVPlan(Type *InductionTy,
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
VPBasicBlock *ScalarPH = new VPBasicBlock("scalar.ph");
+ VPBasicBlock *ScalarHeader =
+ VPIRBasicBlock::fromBasicBlock(TheLoop->getHeader());
+ VPBlockUtils::connectBlocks(ScalarPH, ScalarHeader);
if (!RequiresScalarEpilogueCheck) {
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
return Plan;
@@ -1051,6 +1057,8 @@ void VPlan::execute(VPTransformState *State) {
BrInst->insertBefore(MiddleBB->getTerminator());
MiddleBB->getTerminator()->eraseFromParent();
State->CFG.DTU.applyUpdates({{DominatorTree::Delete, MiddleBB, ScalarPh}});
+ State->CFG.DTU.applyUpdates(
+ {{DominatorTree::Delete, ScalarPh, ScalarPh->getSingleSuccessor()}});
// Generate code in the loop pre-header and body.
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
@@ -1169,12 +1177,6 @@ void VPlan::print(raw_ostream &O) const {
Block->print(O, "", SlotTracker);
}
- if (!LiveOuts.empty())
- O << "\n";
- for (const auto &KV : LiveOuts) {
- KV.second->print(O, SlotTracker);
- }
-
O << "}\n";
}
@@ -1211,11 +1213,6 @@ LLVM_DUMP_METHOD
void VPlan::dump() const { print(dbgs()); }
#endif
-void VPlan::addLiveOut(PHINode *PN, VPValue *V) {
- assert(LiveOuts.count(PN) == 0 && "an exit value for PN already exists");
- LiveOuts.insert({PN, new VPLiveOut(PN, V)});
-}
-
static void remapOperands(VPBlockBase *Entry, VPBlockBase *NewEntry,
DenseMap<VPValue *, VPValue *> &Old2NewVPValues) {
// Update the operands of all cloned recipes starting at NewEntry. This
@@ -1283,10 +1280,6 @@ VPlan *VPlan::duplicate() {
remapOperands(Preheader, NewPreheader, Old2NewVPValues);
remapOperands(Entry, NewEntry, Old2NewVPValues);
- // Clone live-outs.
- for (const auto &[_, LO] : LiveOuts)
- NewPlan->addLiveOut(LO->getPhi(), Old2NewVPValues[LO->getOperand(0)]);
-
// Initialize remaining fields of cloned VPlan.
NewPlan->VFs = VFs;
NewPlan->UFs = UFs;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 8392aec8ad396e..41339eb701b2cb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -655,48 +655,6 @@ class VPBlockBase {
virtual VPBlockBase *clone() = 0;
};
-/// A value that is used outside the VPlan. The operand of the user needs to be
-/// added to the associated phi node. The incoming block from VPlan is
-/// determined by where the VPValue is defined: if it is defined by a recipe
-/// outside a region, its parent block is used, otherwise the middle block is
-/// used.
-class VPLiveOut : public VPUser {
- PHINode *Phi;
-
-public:
- VPLiveOut(PHINode *Phi, V...
[truncated]
|
Use SCEV to check if the minimum iteration check (TC < Step) is known to be false. This is a first step towards addressing #111098. To catch the exact case from the issue, we need to do extra work to make sure the wrap flags on the shl are preserved and used by SCEV. Note that skeleton creation will be gradually moved to VPlan and this simplification should be done as VPlan transform eventually. The current plan is to move skeleton creation to VPlan starting from parts closest to the parts already created by VPlan, starting with induction resume value creation (started with #110577), then memory and SCEV checks and finally minimum iteration checks. PR: #111310
) Use SCEV to check if the minimum iteration check (TC < Step) is known to be false. This is a first step towards addressing llvm#111098. To catch the exact case from the issue, we need to do extra work to make sure the wrap flags on the shl are preserved and used by SCEV. Note that skeleton creation will be gradually moved to VPlan and this simplification should be done as VPlan transform eventually. The current plan is to move skeleton creation to VPlan starting from parts closest to the parts already created by VPlan, starting with induction resume value creation (started with llvm#110577), then memory and SCEV checks and finally minimum iteration checks. PR: llvm#111310
Updated ILV.crateInductionResumeValues to directly update the VPIRInstructiosn wrapping the original phis with the created resume values. This is the first step towards modeling them completely in VPlan. Subsequent patches will move creation of the resume values completely into VPlan. Builds on top of llvm#109975, which is included in this PR.
aac0b49
to
33b3aac
Compare
Rebased after landing #109975, ping :) |
dc06bd9
to
c98b6d3
Compare
ping :) |
@@ -532,6 +533,11 @@ class InnerLoopVectorizer { | |||
/// count of the original loop for both main loop and epilogue vectorization. | |||
void setTripCount(Value *TC) { TripCount = TC; } | |||
|
|||
std::pair<BasicBlock *, Value *> | |||
getInductionBypassValue(PHINode *OrigPhi) const { | |||
return InductionBypassValues.find(OrigPhi)->second; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return InductionBypassValues.find(OrigPhi)->second; | |
return InductionBypassValues.at(OrigPhi); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
} else | ||
continue; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
} else | |
continue; | |
} else { | |
continue; | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed, thanks!
PHINode *createInductionResumeValue( | ||
/// Create a ResumePHI VPInstruction for the induction variable \p OrigPhi to | ||
/// resume iteration count in the scalar epilogue, from where the vectorized | ||
/// loop left off and add it the scalar preheader of the VPlan. \p Step is the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// loop left off and add it the scalar preheader of the VPlan. \p Step is the | |
/// loop left off, and add it to the scalar preheader of VPlan. \p Step is the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
/// provides information about the bypass block and the end value on the edge | ||
/// from bypass to this loop. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// provides information about the bypass block and the end value on the edge | |
/// from bypass to this loop. | |
/// provides this additional bypass block along with the resume value coming | |
/// from it. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
/// end value on the edge from bypass to this loop. | ||
PHINode *createInductionResumeValue( | ||
/// Create a ResumePHI VPInstruction for the induction variable \p OrigPhi to | ||
/// resume iteration count in the scalar epilogue, from where the vectorized |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// resume iteration count in the scalar epilogue, from where the vectorized | |
/// resume iteration count in the scalar epilogue from where the vectorized |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed, thanks!
for (VPRecipeBase &R : *ScalarLoopHeader) { | ||
auto *IRI = cast<VPIRInstruction>(&R); | ||
if (&IRI->getInstruction() == OrigPhi) { | ||
IRI->addOperand(ResumePhiRecipe); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth asserting IRI has no operands before adding one?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
{Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())}, | ||
OrigPhi->getDebugLoc(), "bc.resume.val"); | ||
auto *ScalarLoopHeader = Plan.getScalarHeader(); | ||
for (VPRecipeBase &R : *ScalarLoopHeader) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Instead of searching linearly for each induction phi in scalar loop header, have createInductionResumeValues() scan the IRI VPIRInstructions of scalar loop header which wrap inductions, passing IRI to createInductionResumeValue(), from which OrigPhi can be retrieved? More below.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
BasicBlock *PH = L->getLoopPreheader(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BasicBlock *PH = L->getLoopPreheader(); | |
// SOME COMMENT | |
BasicBlock *PH = L->getLoopPreheader(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added comment at new location, thanks!
for (const auto &[IVPhi, _] : LVL.getInductionVars()) { | ||
auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); | ||
const auto &[BB, V] = EpilogILV.getInductionBypassValue(IVPhi); | ||
Inc->setIncomingValueForBlock(BB, V); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could this be taken care of by LVP.executePlan() above rather than here in LoopVectorizePass::processLoop()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, moved, thanks
for (VPRecipeBase &R : | ||
Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we iterate over the IRI's of the scalar loop header instead, as suggested above?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately I don't think we can here, as we don't have a mapping from IR values to VPValues (other than live-outs here)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Iterating over the IRI's of scalar header instead of the integer/fp induction header phi recipes of vector header, relieves the need for such a mapping - to find PhiR which wraps IndPhi (by searching thru the recipes of scalar header for each IndPhi), as in plural createInductionResumeValues()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The code here originally tried to just create resume value for wide phis, and the scalar VPIRInstructions don't have a link to the induction phi recipes. But I replaced this now as per the suggestion above.
@@ -522,31 +522,31 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { | |||
; PRED: pred.store.continue: | |||
; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 | |||
; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] | |||
; PRED: pred.store.if3: | |||
; PRED: pred.store.if2: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a way to remove or reduce the amount of such irrelevant naming changes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trying to avoid additional redundant changes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately I don't think so, as the numbering depends on the exact order in which names are created that need de-duplication (there's a single counter for all names)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK. Just curious - are there now fewer(?) de-duplications taking place, or only a reordering of de-duplications?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trying to confirm this patch is effectively NFCI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Tried to strop as many unrelated changes from the diffs, but also added some new ones in the ones with predicated blocks to update the block variables.
/// Mapping of induction phis to their bypass values and bypass blocks. | ||
DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is needed only for "additional" bypasses. Is there a way to avoid storing this mapping?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At the moment I think this is the only place it is stored, so unfortunately there's no other way to retrieve it unless storing it here (or somewhere else in ILV)
VPIRInstruction *PhiR, PHINode *OrigPhi, const InductionDescriptor &II, | ||
Value *Step, ArrayRef<BasicBlock *> BypassBlocks, | ||
VPBuilder &ScalarPHBuilder, | ||
std::pair<BasicBlock *, Value *> AdditionalBypass) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
VPIRInstruction *PhiR, PHINode *OrigPhi, const InductionDescriptor &II, | |
Value *Step, ArrayRef<BasicBlock *> BypassBlocks, | |
VPBuilder &ScalarPHBuilder, | |
std::pair<BasicBlock *, Value *> AdditionalBypass) { | |
VPIRInstruction *PhiR, const InductionDescriptor &II, | |
Value *Step, ArrayRef<BasicBlock *> BypassBlocks, | |
VPBuilder &ScalarPHBuilder, | |
std::pair<BasicBlock *, Value *> AdditionalBypass) { | |
auto *OrigPhi = cast<PHINode>(&PhiR->getInstruction()); |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, the PHINode argument has been removed in the latest version
// Store the bypass values here, as they need to be added to their phi nodes | ||
// after the epilogue skeleton has been created. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The value for "non-additional" bypasses also need to be added after creating epilog skeleton.
This is due to ResumePhi holding only two operands (one value coming from the vectorized loop and the other value bypassing it), whereas when vectorizing the epilog loop it should have three?
Perhaps allow ResumePhi to hold a third operand, sorted "upwards", according to https://llvm.org/docs/Vectorizers.html#epilogue-vectorization?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds good, but maybe better done as follow up?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very well, perhaps worth leaving a TODO.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added, thanks
@@ -2676,13 +2680,14 @@ void InnerLoopVectorizer::createInductionResumeValues( | |||
// iteration in the vectorized loop. | |||
// If we come from a bypass edge then we need to start from the original | |||
// start value. | |||
VPBasicBlock *ScalarPHVPBB = Plan.getScalarPreheader(); | |||
VPBuilder ScalarPHBuilder(ScalarPHVPBB, ScalarPHVPBB->begin()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the long-term roadmap, it may be better to update eveny ResumePhi of the scalar preheader when each bypass block is introduced, to keep this introduction atomic, keeping control-flow predecessors up-to-date with data-flow phi's.
// Create induction resume values and ResumePhis for the inductions in the | ||
// epilogue loop in the VPlan for the epilogue vector loop. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sentence is somewhat cumbersome.
Below EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton()
continues to call plural createInductionResumeValues()
, for resume values of the scalar loop, with additional bypass.
Here EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton()
now calls singular createInductionResumeValue()
, repeatedly, for resume values of the epilog loop, w/o additional bypass.
Can plural createInductionResumeValues()
be called here as well, w/o additional bypass? It seems to be doing exactly what's needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It can, at the cost of creating more bypass values than needed; in particular we only need resume values for widened inductions, other inductions resume from the canonical one which will get created separately.
I updated it for now, but there are some tests which need to be updated with the redundant phis (which previously got removed). Thinking about it now, we might have to create them all here, as an induction being widened in the main loop may not be widened in the epilogue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there still more bypass values created than needed, leading to test changes?
for (VPRecipeBase &R : | ||
Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Iterating over the IRI's of scalar header instead of the integer/fp induction header phi recipes of vector header, relieves the need for such a mapping - to find PhiR which wraps IndPhi (by searching thru the recipes of scalar header for each IndPhi), as in plural createInductionResumeValues()?
PHINode *InnerLoopVectorizer::createInductionResumeValue( | ||
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, | ||
ArrayRef<BasicBlock *> BypassBlocks, | ||
void InnerLoopVectorizer::createInductionResumeValue( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth renaming createInductionResumeVPValue()
?
This used to create only IR Values, in particular a PHINode in scalar preheader. Now it creates a ResumePhi recipe in scalar preheader instead, while still generating IR in vector preheader/bypass.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks!
ResumeV = MainILV.createInductionResumeValue( | ||
IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs), | ||
{EPI.MainLoopIterationCountCheck}); | ||
ResumeV = IndPhi->getIncomingValueForBlock(L->getLoopPreheader()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there an existing VPValue to resume from, is it not already used as the start value of header phi induction recipes of epilog loop?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I might be missing something, but the code here sets the start value for the header recipes in the epilogue. We could possibly get it from the ResumePhis in the main vector loop VPlan, but the VPValues would be defined in a different plan?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, right. Perhaps worth a note explaining the (missing) connection to ResumePhi recipes in main loop VPlan, which were executed and hooked up to the Phi nodes of the scalar loop.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Something like // Hook up to the PHINode generated by a ResumePhi recipe of main loop VPlan, which feeds the scalar loop
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
@@ -522,31 +522,31 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 { | |||
; PRED: pred.store.continue: | |||
; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1 | |||
; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] | |||
; PRED: pred.store.if3: | |||
; PRED: pred.store.if2: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Trying to avoid additional redundant changes.
33308d2
to
ce214f5
Compare
(responding here as the comment button disappeared...) In some cases we create an additional resume phi, but most renames are due to the resume phis now being created after generating code for the main vector loop instead of creating them upfront (and thus bumping the name counter earlier before this patch) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"AdditionalBypassValue" is inaccurate - it serves as the additional bypass value only for the primary induction, other inductions use it to build their additional bypass value. Suggest to call it "MainVectorTripCount".
return Induction2AdditionalBypassValue.at(OrigPhi); | ||
} | ||
|
||
/// Return the additional bypass block. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// Return the additional bypass block. | |
/// Return the additional bypass block which targets the scalar loop by skipping the epilog loop after completing the main loop. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step, | ||
ArrayRef<BasicBlock *> BypassBlocks, | ||
std::pair<BasicBlock *, Value *> AdditionalBypass) { | ||
void InnerLoopVectorizer::createInductionResumeVPValue( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This method moves from generating (and returning) an IR PHINode to generating (and recording in VPlan) a ResumePhi recipe. It should therefore migrate from ILV to LVP (perhaps extending VP[Recipe]Builder). At the moment however it still generates IR Values in addition to said recipe, by calling getOrCreateVectorTripCount() and emitTransformedIndex(). These should migrate to use VPlan::getVectorTripCount() and perhaps some form of VPDerivedIVRecipe, respectively. Would be good to leave behind a TODO?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added a TODO, thanks
void InnerLoopVectorizer::createInductionResumeVPValue( | ||
VPIRInstruction *InductionPhiRI, const InductionDescriptor &II, Value *Step, | ||
ArrayRef<BasicBlock *> BypassBlocks, VPBuilder &ScalarPHBuilder, | ||
Value *AdditionalBypassValue) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Value *AdditionalBypassValue) { | |
Value *MainVectorTripCount) { |
this serves as the additional bypass value for primary induction, and the basis for computing additional bypass value for other inductions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
BasicBlock *PH = OrigLoop->getLoopPreheader(); | ||
for (const auto &[IVPhi, _] : Legal->getInductionVars()) { | ||
auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH)); | ||
Value *V = ILV.getInductionAdditionalBypassValue(IVPhi); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure.
Value *EndValueFromAdditionalBypass = AdditionalBypassValue; | ||
if (OrigPhi == OldInduction) { | ||
// We know what the end value is. | ||
EndValue = VectorTripCount; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Value *EndValueFromAdditionalBypass = AdditionalBypassValue; | |
if (OrigPhi == OldInduction) { | |
// We know what the end value is. | |
EndValue = VectorTripCount; | |
Value *EndValueFromAdditionalBypass = MainVectorTripCount; | |
// Otherwise compute them accordingly. | |
if (OrigPhi != OldInduction) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done thanks!
; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] | ||
; CHECK: pred.store.if2: | ||
; CHECK: pred.store.if1: | ||
; CHECK-NEXT: [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], 2008 | ||
; CHECK-NEXT: [[TMP11:%.*]] = add i16 [[TMP10]], 2008 | ||
; CHECK-NEXT: [[TMP12:%.*]] = udiv i16 4943, [[TMP11]] | ||
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP12]] | ||
; CHECK-NEXT: store i16 0, ptr [[TMP13]], align 2 | ||
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] | ||
; CHECK: pred.store.continue3: | ||
; CHECK: pred.store.continue2: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(introduced mismatches)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both mismatches still here.
; CHECK-NEXT: br label [[VECTOR_BODY33:%.*]] | ||
; CHECK: vector.body33: | ||
; CHECK: vector.body35: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mismatch
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed, thanks
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY60:%.*]] | ||
; CHECK: vec.epilog.vector.body60: | ||
; CHECK: vec.epilog.vector.body58: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mismatch
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed, thanks
; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT22]], <i64 0, i64 1, i64 2, i64 3> | ||
; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT18]] | ||
; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT18]], <i64 0, i64 1, i64 2, i64 3> | ||
; CHECK-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT20]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(diff misaligned)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, seems like too many changes unfortunately
; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[OFFSET_IDX6]], 12 | ||
; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] | ||
; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[NEXT_GEP10]], align 16 | ||
; CHECK-NEXT: store i32 [[TMP17]], ptr [[NEXT_GEP5]], align 16 | ||
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE18]] | ||
; CHECK: pred.store.continue18: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(diff misaligned)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This LGTM, thanks!
Adding final comments and suggestion.
// the epilogue loop. | ||
// Generate VPValues and ResumePhi recipes for inductions in the epilogue loop | ||
// to resume from the main loop or bypass it, if there are any wide | ||
// inductions. Otherwise it is we only need a resume value for the canonical |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// inductions. Otherwise it is we only need a resume value for the canonical | |
// inductions. Otherwise we only need a resume value for the canonical |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed, thanks
// vector epilogue preheader | ||
// Generate a resume phi for the canonical induction of the vector epilogue | ||
// and put it in the vector epilogue preheader, unless such a phi already | ||
// exists there - and can be reused |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// exists there - and can be reused | |
// exists there - and can be reused |
// exists there - and can be reused | |
// exists there - and can be reused. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
@@ -10243,7 +10288,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { | |||
// The first pass vectorizes the main loop and creates a scalar epilogue | |||
// to be vectorized by executing the plan (potentially with a different | |||
// factor) again shortly afterwards. | |||
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); | |||
VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps worth commenting that BestEpiPlan is used only to avoid generating redundant instructions in the best main plan - if unused by BestEpiPlan.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated by collecting a set of IVs that need resume values and passing it, thanks
; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]] | ||
; CHECK: pred.store.if2: | ||
; CHECK: pred.store.if1: | ||
; CHECK-NEXT: [[TMP10:%.*]] = add i16 [[OFFSET_IDX]], 2008 | ||
; CHECK-NEXT: [[TMP11:%.*]] = add i16 [[TMP10]], 2008 | ||
; CHECK-NEXT: [[TMP12:%.*]] = udiv i16 4943, [[TMP11]] | ||
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[P]], i16 [[TMP12]] | ||
; CHECK-NEXT: store i16 0, ptr [[TMP13]], align 2 | ||
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]] | ||
; CHECK: pred.store.continue3: | ||
; CHECK: pred.store.continue2: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both mismatches still here.
if (any_of( | ||
EPI.EpiloguePlan.getVectorLoopRegion()->getEntryBasicBlock()->phis(), | ||
IsaPred<VPWidenIntOrFpInductionRecipe, | ||
VPWidenPointerInductionRecipe>)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth collecting all original induction phi nodes that get widen in the epilog, and pass that set to createInductionResumeValues(), rather than checking if the set is empty and gating this function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done, thanks
Split off NFC part refactoring from #110577. This simplifies and clarifies induction resume value creation for bypass blocks.
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/169/builds/6111 Here is the relevant piece of the build log for the reference
|
if (!isa<VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe>(&H)) | ||
continue; | ||
if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&H)) | ||
WideIVs.insert(WideIV->getPHINode()); | ||
else | ||
WideIVs.insert(cast<PHINode>(H.getVPSingleValue()->getUnderlyingValue())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Post-commit nit: early-continue seems redundant, suffice to do
if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&H))
WideIVs.insert(WideIV->getPHINode());
else if (isa<VPWidenPointerInductionRecipe>(&H))
WideIVs.insert(cast<PHINode>(H.getVPSingleValue()->getUnderlyingValue()));
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/164/builds/5274 Here is the relevant piece of the build log for the reference
|
Reverted because of crashes in llvm-test-suite when using stage 2 clang (https://llvm-compile-time-tracker.com/show_error.php?commit=4f7f71b7bccdc38f37b82981e8fa9ceb536a7016). |
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/94/builds/2683 Here is the relevant piece of the build log for the reference
|
Ah unfortunate timing, I think the issue has been fixed by 1091fad (first commit that passes the build again looking at https://llvm-compile-time-tracker.com) |
Split off NFC part refactoring from llvm#110577. This simplifies and clarifies induction resume value creation for bypass blocks.
Updated ILV.createInductionResumeValues (now createInductionResumeVPValue) to directly update the VPIRInstructions wrapping the original phis with the created resume values. This is the first step towards modeling them completely in VPlan. Subsequent patches will move creation of the resume values completely into VPlan. Depends on llvm#109975. PR: llvm#110577
Split off NFC part refactoring from llvm#110577. This simplifies and clarifies induction resume value creation for bypass blocks.
Updated ILV.createInductionResumeValues (now createInductionResumeVPValue) to directly update the VPIRInstructions wrapping the original phis with the created resume values. This is the first step towards modeling them completely in VPlan. Subsequent patches will move creation of the resume values completely into VPlan. Depends on llvm#109975. PR: llvm#110577
Split off NFC part refactoring from llvm#110577. This simplifies and clarifies induction resume value creation for bypass blocks.
Updated ILV.createInductionResumeValues (now createInductionResumeVPValue) to directly update the VPIRInstructions wrapping the original phis with the created resume values. This is the first step towards modeling them completely in VPlan. Subsequent patches will move creation of the resume values completely into VPlan. Depends on llvm#109975. PR: llvm#110577
Updated ILV.createInductionResumeValues to directly update the
VPIRInstructions wrapping the original phis with the created resume
values.
This is the first step towards modeling them completely in VPlan.
Subsequent patches will move creation of the resume values completely
into VPlan.
Depends on #109975.