Skip to content

Commit

Permalink
[VPlan] Update final exit value via VPlan.
Browse files Browse the repository at this point in the history
Model updating IV users directly in VPlan, replace fixupIVUsers.

Depends on llvm#110004,
llvm#109975 and
llvm#112145.
  • Loading branch information
fhahn committed Dec 29, 2024
1 parent 7f3428d commit 9b6fa7c
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 162 deletions.
247 changes: 98 additions & 149 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -542,11 +542,6 @@ class InnerLoopVectorizer {
protected:
friend class LoopVectorizationPlanner;

/// Set up the values of the IVs correctly when exiting the vector loop.
virtual void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *VectorTripCount, BasicBlock *MiddleBlock,
VPTransformState &State);

/// Iteratively sink the scalarized operands of a predicated instruction into
/// the block that was created for it.
void sinkScalarOperands(Instruction *PredInst);
Expand Down Expand Up @@ -775,10 +770,6 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;

void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
Value *VectorTripCount, BasicBlock *MiddleBlock,
VPTransformState &State) override {};
};

// A specialized derived class of inner loop vectorizer that performs
Expand Down Expand Up @@ -2751,97 +2742,6 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
return LoopVectorPreHeader;
}

// Patch the LCSSA phis outside the loop that consume an induction variable.
// Those phis arrive here with a single incoming value (from the remainder
// loop); this adds the matching incoming value for the edge that enters them
// directly from \p MiddleBlock.
//
// Two flavours of outside use are supported and get different values:
//  * uses of the latch-incoming (post-increment) value receive the end value,
//  * uses of the phi itself (the penultimate value) receive end value - step.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *VectorTripCount,
                                       BasicBlock *MiddleBlock,
                                       VPTransformState &State) {
  // LCSSA phi -> value it must receive along the middle-block edge.
  DenseMap<Value *, Value *> MissingVals;

  // The end value is whatever the preheader phi feeding OrigPhi takes when
  // control arrives from the middle block.
  BasicBlock *Preheader = OrigLoop->getLoopPreheader();
  auto *PreheaderPhi =
      cast<PHINode>(OrigPhi->getIncomingValueForBlock(Preheader));
  Value *EndValue = PreheaderPhi->getIncomingValueForBlock(MiddleBlock);

  // Outside users of the post-increment value see the end value unchanged.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    auto *UserInst = cast<Instruction>(U);
    if (OrigLoop->contains(UserInst))
      continue;
    assert(isa<PHINode>(UserInst) && "Expected LCSSA form");
    MissingVals[UserInst] = EndValue;
  }

  // Outside users of the phi itself need to see EndValue - Step, which is
  // materialized right before the middle block's terminator.
  for (User *U : OrigPhi->users()) {
    auto *UserInst = cast<Instruction>(U);
    if (OrigLoop->contains(UserInst))
      continue;
    assert(isa<PHINode>(UserInst) && "Expected LCSSA form");

    IRBuilder<> B(MiddleBlock->getTerminator());

    // Carry over fast-math flags from the original induction update, if it
    // is a floating-point operation.
    if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
      B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

    // Fetch the IR value of the step: either a live-in or the lane-0 value
    // generated while executing the plan.
    VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
    assert(StepVPV && "step must have been expanded during VPlan execution");
    Value *Step = nullptr;
    if (StepVPV->isLiveIn())
      Step = StepVPV->getLiveInIRValue();
    else
      Step = State.get(StepVPV, VPLane(0));

    // Undo one step, using the operation that matches the induction type.
    Type *EndTy = EndValue->getType();
    Value *Escape = nullptr;
    if (EndTy->isIntegerTy()) {
      Escape = B.CreateSub(EndValue, Step);
    } else if (EndTy->isPointerTy()) {
      Value *NegStep = B.CreateNeg(Step);
      Escape = B.CreatePtrAdd(EndValue, NegStep);
    } else {
      assert(EndValue->getType()->isFloatingPointTy() &&
             "Unexpected induction type");
      // The inverse of an fadd induction is fsub; anything else is undone
      // with fadd.
      bool IsFAdd = II.getInductionBinOp()->getOpcode() == Instruction::FAdd;
      Escape = B.CreateBinOp(IsFAdd ? Instruction::FSub : Instruction::FAdd,
                             EndValue, Step);
    }
    Escape->setName("ind.escape");
    MissingVals[UserInst] = Escape;
  }

  // Every phi recorded above may only be reached from the middle block or
  // from the original loop latch.
  assert((MissingVals.empty() ||
          all_of(MissingVals,
                 [MiddleBlock, this](const std::pair<Value *, Value *> &Entry) {
                   BasicBlock *UserBB =
                       cast<Instruction>(Entry.first)->getParent();
                   return all_of(predecessors(UserBB),
                                 [MiddleBlock, this](BasicBlock *Pred) {
                                   return Pred == MiddleBlock ||
                                          Pred == OrigLoop->getLoopLatch();
                                 });
                 })) &&
         "Expected escaping values from latch/middle.block only");

  for (auto &Entry : MissingVals) {
    auto *LCSSAPhi = cast<PHINode>(Entry.first);
    // Two IVs may "chase" each other, i.e.
    //   %IV2 = phi [...], [ %IV1, %latch ]
    // If IV1 is used outside the loop, both "last value of IV1" and
    // "penultimate value of IV2" would be candidates for the same phi. Only
    // add an incoming value if the middle block does not have one yet.
    if (LCSSAPhi->getBasicBlockIndex(MiddleBlock) != -1)
      continue;
    LCSSAPhi->addIncoming(Entry.second, MiddleBlock);
  }
}

namespace {

struct CSEDenseMapInfo {
Expand Down Expand Up @@ -2986,24 +2886,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
for (PHINode &PN : Exit->phis())
PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN);

if (Cost->requiresScalarEpilogue(VF.isVector())) {
// No edge from the middle block to the unique exit block has been inserted
// and there is nothing to fix from vector loop; phis should have incoming
// from scalar loop only.
} else {
// TODO: Check in VPlan to see if IV users need fixing instead of checking
// the cost model.

// If we inserted an edge from the middle block to the unique exit block,
// update uses outside the loop (phis) to account for the newly inserted
// edge.

// Fix-up external users of the induction variables.
for (const auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
}

for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);

Expand Down Expand Up @@ -8857,11 +8739,10 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction.
static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
VPBuilder &VectorPHBuilder,
VPBuilder &ScalarPHBuilder,
VPTypeAnalysis &TypeInfo,
VPValue *VectorTC) {
static VPValue *addResumePhiRecipeForInduction(
VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC,
DenseMap<VPValue *, VPValue *> &EndValues) {
auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
// Truncated wide inductions resume from the last lane of their vector value
// in the last vector iteration which is handled elsewhere.
Expand All @@ -8886,6 +8767,7 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
ScalarTypeOfWideIV);
}

EndValues[WideIV] = EndValue;
auto *ResumePhiRecipe =
ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
WideIV->getDebugLoc(), "bc.resume.val");
Expand All @@ -8895,7 +8777,9 @@ static VPValue *addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV,
/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
Loop *OrigLoop,
DenseMap<VPValue *, VPValue *> &EndValues) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
Expand All @@ -8915,7 +8799,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
if (VPValue *ResumePhi = addResumePhiRecipeForInduction(
WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
&Plan.getVectorTripCount())) {
&Plan.getVectorTripCount(), EndValues)) {
ScalarPhiIRI->addOperand(ResumePhi);
continue;
}
Expand Down Expand Up @@ -8949,9 +8833,9 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
// modeled explicitly yet and won't be included. Those are un-truncated
// VPWidenIntOrFpInductionRecipe, VPWidenPointerInductionRecipe and induction
// increments.
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
static SetVector<VPIRInstruction *>
collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder,
VPlan &Plan) {
auto *MiddleVPBB = Plan.getMiddleBlock();
SetVector<VPIRInstruction *> ExitUsersToFix;
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
Expand All @@ -8976,18 +8860,6 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
// Exit values for inductions are computed and updated outside of VPlan
// and independent of induction recipes.
// TODO: Compute induction exit values in VPlan.
if ((isa<VPWidenIntOrFpInductionRecipe>(V) &&
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
isa<VPWidenPointerInductionRecipe>(V) ||
(isa<Instruction>(IncomingValue) &&
OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
any_of(IncomingValue->users(), [&Inductions](User *U) {
auto *P = dyn_cast<PHINode>(U);
return P && Inductions.contains(P);
}))) {
if (ExitVPBB->getSinglePredecessor() == MiddleVPBB)
continue;
}
ExitUsersToFix.insert(ExitIRI);
ExitIRI->addOperand(V);
}
Expand All @@ -8996,17 +8868,86 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
return ExitUsersToFix;
}

/// If \p Incoming is a non-truncated wide induction, or a value whose
/// underlying IR instruction is used by one of the induction phis in
/// \p Inductions, create the recipes computing the value to use on exit and
/// install it as operand 0 of \p ExitIRI.
///
/// \param Plan       Plan whose middle block receives any newly created
///                   recipes (inserted before its terminator).
/// \param ExitIRI    Exit-block user whose operand 0 is rewritten.
/// \param Incoming   Value currently feeding \p ExitIRI.
/// \param Inductions Induction phis of the original loop.
/// \param EndValues  Map from wide induction recipe to its end value; it is
///                   only read here and must already contain the induction
///                   that \p Incoming belongs to.
/// \param TypeInfo   Used to infer the scalar types of the induction and of
///                   its step.
/// \returns true if \p ExitIRI was updated, false if \p Incoming is not
///          induction related and has to be handled by the caller.
static bool addInductionEndValue(
    VPlan &Plan, VPIRInstruction *ExitIRI, VPValue *Incoming,
    const MapVector<PHINode *, InductionDescriptor> &Inductions,
    DenseMap<VPValue *, VPValue *> &EndValues, VPTypeAnalysis &TypeInfo) {
  // True if V is an IR instruction with at least one user that is an
  // induction phi. Tolerates values without an underlying IR value.
  auto FeedsInductionPhi = [&Inductions](Value *V) {
    auto *I = dyn_cast_or_null<Instruction>(V);
    return I && any_of(I->users(), [&Inductions](User *U) {
             auto *P = dyn_cast<PHINode>(U);
             return P && Inductions.contains(P);
           });
  };

  bool IsInductionRelated =
      (isa<VPWidenIntOrFpInductionRecipe>(Incoming) &&
       !cast<VPWidenIntOrFpInductionRecipe>(Incoming)->getTruncInst()) ||
      isa<VPWidenPointerInductionRecipe>(Incoming) ||
      FeedsInductionPhi(Incoming->getUnderlyingValue());
  if (!IsInductionRelated)
    return false;

  VPRecipeBase *IncomingR = Incoming->getDefiningRecipe();
  assert(IncomingR && "induction-related exit value must have a recipe");

  // Find the wide induction Incoming belongs to: Incoming itself, or one of
  // the first two operands of the recipe defining it. Operand 0 may be a
  // live-in without a defining recipe, hence the null-tolerant cast; in that
  // case operand 1 is used.
  VPValue *IV = nullptr;
  if (auto *WideIV = dyn_cast<VPWidenInductionRecipe>(IncomingR))
    IV = WideIV;
  else if (auto *WideIV = dyn_cast_or_null<VPWidenInductionRecipe>(
               IncomingR->getOperand(0)->getDefiningRecipe()))
    IV = WideIV;
  else
    IV = IncomingR->getOperand(1);

  // Use lookup() rather than operator[] so that a missing entry is diagnosed
  // instead of silently inserting a null end value into the map and wiring
  // it into ExitIRI.
  VPValue *EndValue = EndValues.lookup(IV);
  assert(EndValue && "missing end value for induction used outside the loop");

  // If Incoming is computed from a wide induction (one of its operands is a
  // wide induction recipe), the user receives the end value directly.
  if (any_of(IncomingR->operands(),
             IsaPred<VPWidenIntOrFpInductionRecipe,
                     VPWidenPointerInductionRecipe>)) {
    ExitIRI->setOperand(0, EndValue);
    return true;
  }

  // Otherwise the user receives the end value moved back by one step; the
  // operation used depends on the scalar type of the induction.
  VPBuilder B(Plan.getMiddleBlock()->getTerminator());
  VPValue *Escape = nullptr;
  auto *WideIV = cast<VPWidenInductionRecipe>(IV->getDefiningRecipe());
  VPValue *Step = WideIV->getStepValue();
  Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
  if (ScalarTy->isIntegerTy()) {
    Escape =
        B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
  } else if (ScalarTy->isPointerTy()) {
    // Negate the step as (0 - Step). The step type is inferred through
    // TypeInfo so that steps which are not live-ins are supported as well.
    VPValue *Zero = Plan.getOrAddLiveIn(
        ConstantInt::get(TypeInfo.inferScalarType(Step), 0));
    Escape = B.createPtrAdd(
        EndValue, B.createNaryOp(Instruction::Sub, {Zero, Step}), {},
        "ind.escape");
  } else if (ScalarTy->isFloatingPointTy()) {
    // Apply the inverse of the induction's update operation, keeping its
    // fast-math flags.
    const auto &ID = WideIV->getInductionDescriptor();
    Escape = B.createNaryOp(
        ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
            ? Instruction::FSub
            : Instruction::FAdd,
        {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
  } else {
    llvm_unreachable("all possible induction types must be handled");
  }
  ExitIRI->setOperand(0, Escape);
  return true;
}
// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated. Returns true if all
// exit users can be handled, otherwise return false.
static bool
addUsersInExitBlocks(VPlan &Plan,
const SetVector<VPIRInstruction *> &ExitUsersToFix) {
static bool addUsersInExitBlocks(
VPlan &Plan, const SetVector<VPIRInstruction *> &ExitUsersToFix,
const MapVector<PHINode *, InductionDescriptor> &Inductions,
DenseMap<VPValue *, VPValue *> &EndValues) {
if (ExitUsersToFix.empty())
return true;

auto *MiddleVPBB = Plan.getMiddleBlock();
VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());

// Introduce extract for exiting values and update the VPIRInstructions
// modeling the corresponding LCSSA phis.
Expand All @@ -9022,11 +8963,16 @@ addUsersInExitBlocks(VPlan &Plan,
if (ExitIRI->getParent()->getSinglePredecessor() != MiddleVPBB)
return false;

VPValue *Incoming = ExitIRI->getOperand(0);
if (addInductionEndValue(Plan, ExitIRI, Incoming, Inductions, EndValues,
TypeInfo))
continue;

LLVMContext &Ctx = ExitIRI->getInstruction().getContext();
VPValue *Ext = B.createNaryOp(VPInstruction::ExtractFromEnd,
{Op, Plan.getOrAddLiveIn(ConstantInt::get(
IntegerType::get(Ctx, 32), 1))});
ExitIRI->setOperand(Idx, Ext);
ExitIRI->setOperand(0, Ext);
}
}
return true;
Expand Down Expand Up @@ -9307,11 +9253,13 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPlanTransforms::handleUncountableEarlyExit(
*Plan, *PSE.getSE(), OrigLoop, UncountableExitingBlock, RecipeBuilder);
}
addScalarResumePhis(RecipeBuilder, *Plan);
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
DenseMap<VPValue *, VPValue *> EndValues;
addScalarResumePhis(RecipeBuilder, *Plan, OrigLoop, EndValues);
SetVector<VPIRInstruction *> ExitUsersToFix =
collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan);
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix)) {
if (!addUsersInExitBlocks(*Plan, ExitUsersToFix, Legal->getInductionVars(),
EndValues)) {
reportVectorizationFailure(
"Some exit values in loop with uncountable exit not supported yet",
"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
Expand Down Expand Up @@ -9438,7 +9386,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
}
addScalarResumePhis(RecipeBuilder, *Plan);
DenseMap<VPValue *, VPValue *> EndValues;
addScalarResumePhis(RecipeBuilder, *Plan, OrigLoop, EndValues);

assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
Expand Down
24 changes: 14 additions & 10 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,16 +315,20 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
LastLane = 0;
}

auto *LastInst = cast<Instruction>(get(Def, LastLane));
// Set the insert point after the last scalarized instruction or after the
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
// will directly follow the scalar definitions.
auto *LastDef = get(Def, LastLane);
auto OldIP = Builder.saveIP();
auto NewIP =
isa<PHINode>(LastInst)
? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
: std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);
if (auto *LastInst = dyn_cast<Instruction>(LastDef)) {
// TODO: Remove once VPDerivedIVRecipe can be simplified, which requires
// vector trip count being modeled in VPlan.
// Set the insert point after the last scalarized instruction or after the
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
// will directly follow the scalar definitions.
auto NewIP =
isa<PHINode>(LastInst)
? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
: std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);
}

// However, if we are vectorizing, we need to construct the vector values.
// If the value is known to be uniform after vectorization, we can just
Expand All @@ -339,7 +343,7 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
} else {
// Initialize packing with insertelements to start from undef.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
Value *Undef = PoisonValue::get(VectorType::get(LastDef->getType(), VF));
set(Def, Undef);
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
packScalarIntoVectorValue(Def, Lane);
Expand Down
Loading

0 comments on commit 9b6fa7c

Please sign in to comment.