Skip to content

[VPlan] Simplify branch to scalar ph in VPlan transform. (NFC) #140409

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 23 additions & 37 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ class InnerLoopVectorizer {
MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
RTChecks(RTChecks), Plan(Plan),
VectorPHVPB(Plan.getEntry()->getSingleSuccessor()) {}
VectorPHVPB(Plan.getEntry()->getSuccessors()[1]) {}

virtual ~InnerLoopVectorizer() = default;

Expand Down Expand Up @@ -2365,22 +2365,19 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
if (PreVectorPH->getNumSuccessors() != 1) {
assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
assert(PreVectorPH->getSuccessors()[0] == ScalarPH &&
"Unexpected successor");
VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
PreVectorPH = CheckVPIRBB;
}
assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(CheckIRBB);
VPBlockUtils::insertOnEdge(PreVectorPH, VectorPHVPB, CheckVPIRBB);
PreVectorPH = CheckVPIRBB;
VPBlockUtils::connectBlocks(PreVectorPH, ScalarPH);
PreVectorPH->swapSuccessors();

// We just connected a new block to the scalar preheader. Update all
// ResumePhis by adding an incoming value for it, replicating the last value.
for (VPRecipeBase &R : *cast<VPBasicBlock>(ScalarPH)) {
auto *ResumePhi = dyn_cast<VPInstruction>(&R);
if (!ResumePhi || ResumePhi->getOpcode() != VPInstruction::ResumePhi)
auto *ResumePhi = cast<VPPhi>(&R);
if (ResumePhi->getNumIncoming() == ScalarPH->getNumPredecessors())
continue;
ResumePhi->addOperand(
ResumePhi->getOperand(ResumePhi->getNumOperands() - 1));
Expand Down Expand Up @@ -2463,9 +2460,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
LoopBypassBlocks.push_back(TCCheckBlock);

// TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here.
introduceCheckBlockInVPlan(TCCheckBlock);
}

BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
Expand Down Expand Up @@ -2530,7 +2524,8 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
for (auto &R : make_early_inc_range(*VPBB)) {
assert(!R.isPhi() && "Tried to move phi recipe to end of block");
assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) &&
"Tried to move phi recipe to end of block");
R.moveBefore(*IRVPBB, IRVPBB->end());
}

Expand Down Expand Up @@ -7768,10 +7763,7 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
// over the incoming values correctly.
using namespace VPlanPatternMatch;
auto IsResumePhi = [](VPUser *U) {
auto *VPI = dyn_cast<VPInstruction>(U);
return VPI && VPI->getOpcode() == VPInstruction::ResumePhi;
};
auto IsResumePhi = [](VPUser *U) { return isa<VPPhi>(U); };
assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 &&
"ResumePhi must have a single user");
auto *EpiResumePhiVPI =
Expand Down Expand Up @@ -7837,7 +7829,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(

// 1. Set up the skeleton for vectorization, including vector pre-header and
// middle block. The vector loop is created during VPlan execution.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSingleSuccessor());
VPBasicBlock *VectorPH = cast<VPBasicBlock>(Entry->getSuccessors()[1]);
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
if (VectorizingEpilogue)
VPlanTransforms::removeDeadRecipes(BestVPlan);
Expand Down Expand Up @@ -8070,7 +8062,8 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);

introduceCheckBlockInVPlan(TCCheckBlock);
if (!ForEpilogue)
introduceCheckBlockInVPlan(TCCheckBlock);
return TCCheckBlock;
}

Expand Down Expand Up @@ -8200,7 +8193,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
Plan.setEntry(NewEntry);
// OldEntry is now dead and will be cleaned up when the plan gets destroyed.

introduceCheckBlockInVPlan(Insert);
return Insert;
}

Expand Down Expand Up @@ -9146,9 +9138,8 @@ static VPInstruction *addResumePhiRecipeForInduction(
WideIV->getDebugLoc());
}

auto *ResumePhiRecipe =
ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi, {EndValue, Start},
WideIV->getDebugLoc(), "bc.resume.val");
auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
{EndValue, Start}, WideIV->getDebugLoc(), "bc.resume.val");
return ResumePhiRecipe;
}

Expand All @@ -9160,7 +9151,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
DenseMap<VPValue *, VPValue *> &IVEndValues) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
auto *ScalarPH = Plan.getScalarPreheader();
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
VPBuilder VectorPHBuilder(
cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
Expand All @@ -9177,8 +9168,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
&Plan.getVectorTripCount())) {
assert(ResumePhi->getOpcode() == VPInstruction::ResumePhi &&
"Expected a ResumePhi");
assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
IVEndValues[WideIVR] = ResumePhi->getOperand(0);
ScalarPhiIRI->addOperand(ResumePhi);
continue;
Expand All @@ -9203,8 +9193,7 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
VPInstruction::ExtractLastElement, {ResumeFromVectorLoop}, {},
"vector.recur.extract");
StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
auto *ResumePhiR = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi,
auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
{ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {}, Name);
ScalarPhiIRI->addOperand(ResumePhiR);
}
Expand Down Expand Up @@ -10406,9 +10395,7 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
VPI->setOperand(1, Freeze);
if (UpdateResumePhis)
OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
return Freeze != &U && isa<VPInstruction>(&U) &&
cast<VPInstruction>(&U)->getOpcode() ==
VPInstruction::ResumePhi;
return Freeze != &U && isa<VPPhi>(&U);
});
}
};
Expand All @@ -10421,13 +10408,12 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
// scalar (which will become vector) epilogue loop we are done. Otherwise
// create it below.
if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
m_Specific(VectorTC), m_SpecificInt(0)));
return match(&R, m_VPInstruction<Instruction::PHI>(m_Specific(VectorTC),
m_SpecificInt(0)));
}))
return;
VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi,
ScalarPHBuilder.createScalarPhi(
{VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
"vec.epilog.resume.val");
}
Expand Down
25 changes: 18 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -887,11 +887,6 @@ class VPInstruction : public VPRecipeWithIRFlags,
SLPStore,
ActiveLaneMask,
ExplicitVectorLength,
/// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
/// The first operand is the incoming value from the predecessor in VPlan,
/// the second operand is the incoming value for all other predecessors
/// (which are currently not modeled in VPlan).
ResumePhi,
CalculateTripCountMinusVF,
// Increment the canonical IV separately for each unrolled part.
CanonicalIVIncrementForPart,
Expand Down Expand Up @@ -1160,6 +1155,8 @@ class VPPhiAccessors {
return getAsRecipe()->getNumOperands();
}

void removeIncomingValue(VPBlockBase *VPB) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void printPhiOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const;
Expand All @@ -1170,11 +1167,15 @@ struct VPPhi : public VPInstruction, public VPPhiAccessors {
VPPhi(ArrayRef<VPValue *> Operands, DebugLoc DL, const Twine &Name = "")
: VPInstruction(Instruction::PHI, Operands, DL, Name) {}

static inline bool classof(const VPRecipeBase *U) {
static inline bool classof(const VPUser *U) {
auto *R = dyn_cast<VPInstruction>(U);
return R && R->getOpcode() == Instruction::PHI;
}

VPPhi *clone() override {
return new VPPhi(operands(), getDebugLoc(), getName());
}

void execute(VPTransformState &State) override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Expand Down Expand Up @@ -3737,6 +3738,15 @@ VPPhiAccessors::getIncomingBlock(unsigned Idx) const {
return getAsRecipe()->getParent()->getCFGPredecessor(Idx);
}

inline void VPPhiAccessors::removeIncomingValue(VPBlockBase *VPB) const {
VPRecipeBase *R = const_cast<VPRecipeBase *>(getAsRecipe());
const VPBasicBlock *Parent = R->getParent();
assert(R->getNumOperands() == Parent->getNumPredecessors());
auto I = find(Parent->getPredecessors(), VPB);
R->getOperand(I - Parent->getPredecessors().begin())->removeUser(*R);
R->removeOperand(I - Parent->getPredecessors().begin());
}

/// A special type of VPBasicBlock that wraps an existing IR basic block.
/// Recipes of the block get added before the first non-phi instruction in the
/// wrapped block.
Expand Down Expand Up @@ -4246,7 +4256,8 @@ class VPlan {
/// that this relies on unneeded branches to the scalar tail loop being
/// removed.
bool hasScalarTail() const {
return getScalarPreheader()->getNumPredecessors() != 0;
return getScalarPreheader()->getNumPredecessors() != 0 &&
getScalarPreheader()->getSinglePredecessor() != getEntry();
}
};

Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(0));
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::Not:
case VPInstruction::ResumePhi:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::AnyOf:
Expand Down
44 changes: 21 additions & 23 deletions llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,8 +507,12 @@ void VPlanTransforms::prepareForVectorization(
cast<VPBasicBlock>(HeaderVPB),
cast<VPBasicBlock>(LatchVPB), Range);
HandledUncountableEarlyExit = true;
} else {
for (VPRecipeBase &R : cast<VPIRBasicBlock>(EB)->phis()) {
if (auto *PhiR = dyn_cast<VPIRPhi>(&R))
PhiR->removeIncomingValue(Pred);
}
}

cast<VPBasicBlock>(Pred)->getTerminator()->eraseFromParent();
VPBlockUtils::disconnectBlocks(Pred, EB);
}
Expand Down Expand Up @@ -541,37 +545,31 @@ void VPlanTransforms::prepareForVectorization(
// Thus if tail is to be folded, we know we don't need to run the
// remainder and we can set the condition to true.
// 3) Otherwise, construct a runtime check.
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
// Also connect the entry block to the scalar preheader.
VPBlockUtils::connectBlocks(Plan.getEntry(), ScalarPH);
Plan.getEntry()->swapSuccessors();

if (!RequiresScalarEpilogueCheck) {
if (auto *LatchExitVPB = MiddleVPBB->getSingleSuccessor())
VPBlockUtils::disconnectBlocks(MiddleVPBB, LatchExitVPB);
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);
// The exit blocks are unreachable, remove their recipes to make sure no
// users remain that may pessimize transforms.
for (auto *EB : Plan.getExitBlocks()) {
for (VPRecipeBase &R : make_early_inc_range(*EB))
R.eraseFromParent();
}
if (MiddleVPBB->getNumSuccessors() != 2)
return;
}

// The connection order corresponds to the operands of the conditional branch,
// with the middle block already connected to the exit block.
VPBlockUtils::connectBlocks(MiddleVPBB, ScalarPH);

auto *ScalarLatchTerm = TheLoop->getLoopLatch()->getTerminator();
// Here we use the same DebugLoc as the scalar loop latch terminator instead
// of the corresponding compare because they may have ended up with
// different line numbers and we want to avoid awkward line stepping while
// debugging. Eg. if the compare has got a line number inside the loop.
VPBuilder Builder(MiddleVPBB);
VPValue *Cmp =
TailFolded
? Plan.getOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(TripCount->getType()->getContext())))
: Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(),
ScalarLatchTerm->getDebugLoc(), "cmp.n");
VPValue *Cmp;
if (TailFolded)
Cmp = Plan.getOrAddLiveIn(ConstantInt::getTrue(
IntegerType::getInt1Ty(TripCount->getType()->getContext())));
else if (!RequiresScalarEpilogueCheck)
Cmp = Plan.getOrAddLiveIn(ConstantInt::getFalse(
IntegerType::getInt1Ty(TripCount->getType()->getContext())));
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(),
ScalarLatchTerm->getDebugLoc(), "cmp.n");
Builder.createNaryOp(VPInstruction::BranchOnCond, {Cmp},
ScalarLatchTerm->getDebugLoc());
}
Expand Down
37 changes: 11 additions & 26 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -754,17 +754,6 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *Addend = State.get(getOperand(1), VPLane(0));
return Builder.CreatePtrAdd(Ptr, Addend, Name, getGEPNoWrapFlags());
}
case VPInstruction::ResumePhi: {
auto *NewPhi =
Builder.CreatePHI(State.TypeAnalysis.inferScalarType(this), 2, Name);
for (const auto &[IncVPV, PredVPBB] :
zip(operands(), getParent()->getPredecessors())) {
Value *IncV = State.get(IncVPV, /* IsScalar */ true);
BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(cast<VPBasicBlock>(PredVPBB));
NewPhi->addIncoming(IncV, PredBB);
}
return NewPhi;
}
case VPInstruction::AnyOf: {
Value *A = State.get(getOperand(0));
return Builder.CreateOrReduce(A);
Expand Down Expand Up @@ -863,8 +852,7 @@ bool VPInstruction::isVectorToScalar() const {
}

bool VPInstruction::isSingleScalar() const {
return getOpcode() == VPInstruction::ResumePhi ||
getOpcode() == Instruction::PHI;
return getOpcode() == Instruction::PHI;
}

#if !defined(NDEBUG)
Expand Down Expand Up @@ -963,7 +951,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
case VPInstruction::ResumePhi:
return true;
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
Expand Down Expand Up @@ -1020,9 +1007,6 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
case VPInstruction::ResumePhi:
O << "resume-phi";
break;
case VPInstruction::ExplicitVectorLength:
O << "EXPLICIT-VECTOR-LENGTH";
break;
Expand Down Expand Up @@ -1130,15 +1114,16 @@ void VPInstructionWithType::print(raw_ostream &O, const Twine &Indent,

void VPPhi::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
assert(getParent() ==
getParent()->getPlan()->getVectorLoopRegion()->getEntry() &&
"VPInstructions with PHI opcodes must be used for header phis only "
"at the moment");
BasicBlock *VectorPH = State.CFG.VPBB2IRBB.at(getIncomingBlock(0));
Value *Start = State.get(getIncomingValue(0), VPLane(0));
PHINode *Phi = State.Builder.CreatePHI(Start->getType(), 2, getName());
Phi->addIncoming(Start, VectorPH);
State.set(this, Phi, VPLane(0));
PHINode *NewPhi = State.Builder.CreatePHI(
State.TypeAnalysis.inferScalarType(this), 2, getName());
unsigned NumIncoming =
getParent()->getNumPredecessors() == 0 ? 1 : getNumIncoming();
for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
Value *IncV = State.get(getIncomingValue(Idx), /* IsScalar */ true);
BasicBlock *PredBB = State.CFG.VPBB2IRBB.at(getIncomingBlock(Idx));
NewPhi->addIncoming(IncV, PredBB);
}
State.set(this, NewPhi, VPLane(0));
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Expand Down
Loading
Loading