Skip to content

Commit

Permalink
[VPlan] Remove loop region in optimizeForVFAndUF.
Browse files Browse the repository at this point in the history
  • Loading branch information
fhahn committed Oct 10, 2024
1 parent d3614bc commit 5f8fabe
Show file tree
Hide file tree
Showing 16 changed files with 359 additions and 426 deletions.
75 changes: 44 additions & 31 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2927,6 +2927,9 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
for (const auto &KV : Plan.getLiveOuts())
KV.second->fixPhi(Plan, State);

if (!isa<VPRegionBlock>(State.Plan->getEntry()->getSingleSuccessor()))
return;

for (Instruction *PI : PredicatedInstructions)
sinkScalarOperands(&*PI);

Expand Down Expand Up @@ -7537,7 +7540,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
LLVM_DEBUG(BestVPlan.dump());

// Perform the actual loop transformation.
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan);
VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
Legal->getWidestInductionType());

// 0. Generate SCEV-dependent code into the preheader, including TripCount,
// before making any changes to the CFG.
Expand Down Expand Up @@ -7598,14 +7602,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan.execute(&State);

// 2.5 Collect reduction resume values.
auto *ExitVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
if (IsEpilogueVectorization)
if (IsEpilogueVectorization) {
auto *ExitVPBB = cast<VPBasicBlock>(
BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
for (VPRecipeBase &R : *ExitVPBB) {
updateAndCollectMergePhiForReductionForEpilogueVectorization(
dyn_cast<VPInstruction>(&R), State, OrigLoop,
State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
}
}

// 2.6. Maintain Loop Hints
// Keep all loop hints from the original loop on the vector loop (we'll
Expand All @@ -7616,24 +7621,26 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
LLVMLoopVectorizeFollowupVectorized});

VPBasicBlock *HeaderVPBB =
BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
if (VectorizedLoopID)
L->setLoopID(*VectorizedLoopID);
else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
L->setLoopID(LID);

LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
if (auto *R =
dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
VPBasicBlock *HeaderVPBB = R->getEntryBasicBlock();
Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
if (VectorizedLoopID)
L->setLoopID(*VectorizedLoopID);
else {
// Keep all loop hints from the original loop on the vector loop (we'll
// replace the vectorizer-specific hints below).
if (MDNode *LID = OrigLoop->getLoopID())
L->setLoopID(LID);

LoopVectorizeHints Hints(L, true, *ORE);
Hints.setAlreadyVectorized();
}
TargetTransformInfo::UnrollingPreferences UP;
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
addRuntimeUnrollDisableMetaData(L);
}
TargetTransformInfo::UnrollingPreferences UP;
TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
addRuntimeUnrollDisableMetaData(L);

// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
Expand All @@ -7642,15 +7649,20 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.printDebugTracesAtEnd();

// 4. Adjust branch weight of the branch in the middle block.
auto *MiddleTerm =
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
if (MiddleTerm->isConditional() &&
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
// Assume that `Count % VectorTripCount` is equally distributed.
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
if (auto *R =
dyn_cast<VPRegionBlock>(BestVPlan.getEntry()->getSingleSuccessor())) {
auto *ExitVPBB = cast<VPBasicBlock>(R->getSingleSuccessor());

auto *MiddleTerm =
cast<BranchInst>(State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
if (MiddleTerm->isConditional() &&
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
// Assume that `Count % VectorTripCount` is equally distributed.
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
}
}

return State.ExpandedSCEVs;
Expand Down Expand Up @@ -9464,7 +9476,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName("offset.idx");
assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
assert((isa<Constant>(CanonicalIV) || DerivedIV != CanonicalIV) &&
"IV didn't need transforming?");

State.set(this, DerivedIV, VPLane(0));
}
Expand Down
143 changes: 68 additions & 75 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,9 +224,10 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {

VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan)
InnerLoopVectorizer *ILV, VPlan *Plan,
Type *CanonicalIVTy)
: VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {}
LVer(nullptr), TypeAnalysis(CanonicalIVTy) {}

Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
if (Def->isLiveIn())
Expand Down Expand Up @@ -275,8 +276,8 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
// Place the code for broadcasting invariant variables in the new preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
if (SafeToHoist) {
BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
Plan->getVectorLoopRegion()->getSinglePredecessor())];
BasicBlock *LoopVectorPreHeader =
CFG.VPBB2IRBB[cast<VPBasicBlock>(Plan->getEntry())];
if (LoopVectorPreHeader)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
}
Expand Down Expand Up @@ -417,6 +418,12 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
PrevBB->getParent(), CFG.ExitBB);
LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');

connectToPredecessors(NewBB, CFG);
return NewBB;
}

void VPBasicBlock::connectToPredecessors(BasicBlock *NewBB,
VPTransformState::CFGState &CFG) {
// Hook up the new basic block to its predecessors.
for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
Expand Down Expand Up @@ -447,38 +454,14 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
}
CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, NewBB}});
}
return NewBB;
}

void VPIRBasicBlock::execute(VPTransformState *State) {
assert(getHierarchicalSuccessors().size() <= 2 &&
"VPIRBasicBlock can have at most two successors at the moment!");
State->Builder.SetInsertPoint(getIRBasicBlock()->getTerminator());
executeRecipes(State, getIRBasicBlock());
if (getSingleSuccessor()) {
assert(isa<UnreachableInst>(getIRBasicBlock()->getTerminator()));
auto *Br = State->Builder.CreateBr(getIRBasicBlock());
Br->setOperand(0, nullptr);
getIRBasicBlock()->getTerminator()->eraseFromParent();
}

for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
VPBasicBlock *PredVPBB = PredVPBlock->getExitingBasicBlock();
BasicBlock *PredBB = State->CFG.VPBB2IRBB[PredVPBB];
assert(PredBB && "Predecessor basic-block not found building successor.");
LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');

auto *PredBBTerminator = PredBB->getTerminator();
auto *TermBr = cast<BranchInst>(PredBBTerminator);
// Set each forward successor here when it is created, excluding
// backedges. A backward successor is set when the branch is created.
const auto &PredVPSuccessors = PredVPBB->getHierarchicalSuccessors();
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
assert(!TermBr->getSuccessor(idx) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, IRBB);
State->CFG.DTU.applyUpdates({{DominatorTree::Insert, PredBB, IRBB}});
}
connectToPredecessors(getIRBasicBlock(), State->CFG);
}

void VPBasicBlock::execute(VPTransformState *State) {
Expand Down Expand Up @@ -962,7 +945,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,

IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
unsigned UF = getUF();
if (VF.getNumUsers()) {
Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
Expand Down Expand Up @@ -1034,8 +1016,13 @@ void VPlan::execute(VPTransformState *State) {
// skeleton creation, so we can only create the VPIRBasicBlocks now during
// VPlan execution rather than earlier during VPlan construction.
BasicBlock *MiddleBB = State->CFG.ExitBB;
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(getVectorLoopRegion()->getSingleSuccessor());
VPBlockBase *Leaf = nullptr;
for (VPBlockBase *VPB : vp_depth_first_shallow(getEntry()))
if (VPB->getNumSuccessors() == 0) {
Leaf = VPB;
break;
}
VPBasicBlock *MiddleVPBB = cast<VPBasicBlock>(Leaf->getSinglePredecessor());
// Find the VPBB for the scalar preheader, relying on the current structure
// when creating the middle block and its successors: if there's a single
// predecessor, it must be the scalar preheader. Otherwise, the second
Expand Down Expand Up @@ -1063,53 +1050,59 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : vp_depth_first_shallow(Entry))
Block->execute(State);

VPBasicBlock *LatchVPBB = getVectorLoopRegion()->getExitingBasicBlock();
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];

// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
// Skip phi-like recipes that generate their backedge values themselves.
if (isa<VPWidenPHIRecipe>(&R))
continue;

if (isa<VPWidenPointerInductionRecipe>(&R) ||
isa<VPWidenIntOrFpInductionRecipe>(&R)) {
PHINode *Phi = nullptr;
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
} else {
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
"recipe generating only scalars should have been replaced");
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
Phi = cast<PHINode>(GEP->getPointerOperand());
}

Phi->setIncomingBlock(1, VectorLatchBB);
if (auto *LoopRegion =
dyn_cast<VPRegionBlock>(getEntry()->getSingleSuccessor())) {
VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
BasicBlock *VectorLatchBB = State->CFG.VPBB2IRBB[LatchVPBB];

// Fix the latch value of canonical, reduction and first-order recurrences
// phis in the vector loop.
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
for (VPRecipeBase &R : Header->phis()) {
// Skip phi-like recipes that generate their backedge values themselves.
if (isa<VPWidenPHIRecipe>(&R))
continue;

// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
if (isa<VPWidenPointerInductionRecipe>(&R) ||
isa<VPWidenIntOrFpInductionRecipe>(&R)) {
PHINode *Phi = nullptr;
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
} else {
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
"recipe generating only scalars should have been replaced");
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
Phi = cast<PHINode>(GEP->getPointerOperand());
}

Phi->setIncomingBlock(1, VectorLatchBB);

// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());

// Use the steps for the last part as backedge value for the induction.
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
continue;
}

// Use the steps for the last part as backedge value for the induction.
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand()));
continue;
// For canonical IV, first-order recurrences and in-order reduction phis,
// only a single part is generated, which provides the last part from the
// previous iteration. For non-ordered reductions all UF parts are
// generated.
auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
bool NeedsScalar =
isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
Value *Phi = State->get(PhiR, NeedsScalar);
Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}

auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
bool NeedsScalar =
isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
Value *Phi = State->get(PhiR, NeedsScalar);
Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}

State->CFG.DTU.flush();
assert(State->CFG.DTU.getDomTree().verify(
DominatorTree::VerificationLevel::Fast) &&
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ class VPLane {
struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan);
InnerLoopVectorizer *ILV, VPlan *Plan, Type *CanonicalIVTy);

/// The chosen Vectorization Factor of the loop being vectorized.
ElementCount VF;
Expand Down Expand Up @@ -3378,6 +3378,8 @@ class VPBasicBlock : public VPBlockBase {
protected:
/// Execute the recipes in the IR basic block \p BB.
void executeRecipes(VPTransformState *State, BasicBlock *BB);
/// Hook up the IR basic block \p NewBB to the IR blocks of this block's
/// hierarchical predecessors. The new edges are reported to the dominator
/// tree updater held in \p CFG.
void connectToPredecessors(BasicBlock *NewBB,
VPTransformState::CFGState &CFG);

private:
/// Create an IR BasicBlock to hold the output instructions generated by this
Expand Down Expand Up @@ -3499,6 +3501,7 @@ class VPRegionBlock : public VPBlockBase {
assert(!isReplicator() && "should only get pre-header of loop regions");
return getSinglePredecessor()->getExitingBasicBlock();
}
/// Reset this region's Entry pointer to null.
void clearEntry() {
  Entry = nullptr;
}

/// An indicator whether this region is to generate multiple replicated
/// instances of output IR corresponding to its VPBlockBases.
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,9 @@ bool VPRecipeBase::mayHaveSideEffects() const {

void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
VPValue *ExitValue = getOperand(0);
VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
auto *Region = dyn_cast<VPRegionBlock>(Plan.getEntry()->getSingleSuccessor());
VPBasicBlock *MiddleVPBB = dyn_cast_or_null<VPBasicBlock>(
Region ? Region->getSingleSuccessor() : nullptr);
VPRecipeBase *ExitingRecipe = ExitValue->getDefiningRecipe();
auto *ExitingVPBB = ExitingRecipe ? ExitingRecipe->getParent() : nullptr;
// Values leaving the vector loop reach live out phi's in the exiting block
Expand Down Expand Up @@ -2208,7 +2209,9 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
// Replace the temporary unreachable terminator with a new conditional branch,
// whose two destinations will be set later when they are created.
auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
assert(isa<UnreachableInst>(CurrentTerminator) &&
assert((isa<UnreachableInst>(CurrentTerminator) ||
(isa<BranchInst>(CurrentTerminator) &&
!CurrentTerminator->getOperand(0))) &&
"Expected to replace unreachable terminator with conditional branch.");
auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
CondBr->setSuccessor(0, nullptr);
Expand Down
Loading

0 comments on commit 5f8fabe

Please sign in to comment.