Skip to content

[LV] Optionally preserve uniform branches when vectorizing #128187

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 160 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,11 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));

static cl::opt<bool> TryToKeepUnifromBranches(
"vect-keep-uniform-branches", cl::init(false), cl::Hidden,
cl::desc("Enable preservation of uniform branch conditions "
"when vectorizing."));

// Likelyhood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
Expand Down Expand Up @@ -2932,7 +2937,7 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,

void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
// Fix widened non-induction PHIs by setting up the PHI operands.
if (EnableVPlanNativePath)
if (EnableVPlanNativePath || TryToKeepUnifromBranches)
fixNonInductionPHIs(State);

// After vectorization, the exit blocks of the original loop will have
Expand Down Expand Up @@ -7577,6 +7582,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
BestPlan.getVectorLoopRegion()->getSingleSuccessor() !=
BestPlan.getMiddleBlock();
assert((BestFactor.Width == LegacyVF.Width || PlanForEarlyExitLoop ||
TryToKeepUnifromBranches ||
planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
CostCtx, OrigLoop) ||
planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width),
Expand Down Expand Up @@ -9295,6 +9301,106 @@ static void addExitUsersForFirstOrderRecurrences(
}
}

/// Given a VPlan Dominator Tree \p DT that represents the CFG before
/// if-conversion and a block with a conditional branch \p VPBB,
/// find the basic block where the two distinct (but possibly empty)
/// single-exit single-entry subregions on the two sides of that branch
/// join back together, as well as the blocks exiting the two subregions.
/// The join block has to have only these two predecessors.
/// If no such join block and regions was found, return std::nullopt.
static std::optional<std::tuple<VPBlockBase *, VPBlockBase *, VPBlockBase *>>
canKeepBranchDuringIfConversion(const VPDominatorTree &DT, VPBasicBlock *VPBB,
VPlanHCFGBuilder &HCFGBuilder) {
const VPRegionBlock *Region = VPBB->getParent();
auto FindSubregionExit =
[&](VPBasicBlock *Pred,
VPBlockBase *Entry) -> std::pair<VPBlockBase *, VPBlockBase *> {
// The branch preservation is restricted to cases where
// the SESEs are completely empty or have a dedicated entry and exit.
// Because of the way the VPlan is flattened, the entry could already
// have gotten predecessors removed, so check based on the IR.
if (HCFGBuilder.getIRBBForVPB(Entry)->hasNPredecessorsOrMore(2))
return {Pred, Entry};

// Build the biggest possible SESE with the entry Entry.
// As the DT is not updated during flattening, even if other edges
// entering the SESE would have already been removed, the fact
// that there used to be one will be detected.
VPBlockBase *Exiting = nullptr;
SmallSetVector<VPBlockBase *, 4> Worklist;
Worklist.insert(Entry);
for (unsigned I = 0; I < Worklist.size(); I++) {
auto *BB = Worklist[I];
assert(BB->getParent() == Region);
for (auto *Succ : BB->getSuccessors()) {
if (DT.dominates(Entry, Succ))
Worklist.insert(Succ);
else if (Exiting || BB->getNumSuccessors() != 1)
return {nullptr, nullptr};
else
Exiting = BB;
}
}

return {Exiting, Exiting->getSingleSuccessor()};
};

auto [LHSExiting, LHSSucc] =
FindSubregionExit(VPBB, VPBB->getSuccessors()[0]);
auto [RHSExiting, RHSSucc] =
FindSubregionExit(VPBB, VPBB->getSuccessors()[1]);
if (!LHSExiting || !RHSExiting || LHSSucc != RHSSucc ||
LHSSucc->getNumPredecessors() != 2)
return std::nullopt;

return std::tuple(LHSSucc, LHSExiting, RHSExiting);
}

/// Given a basic block \p BranchBB, \p JoinBB, and a pair of blocks
/// that represent the original successor of \p BranchBB and exits
/// (or the \p BranchBB in case of a direct jump to \p JoinBB) of the
/// single-entry single-exit subregions, introduce the branch
/// back into the control flow.
static void reconnectVPlanCFGForPreservedBranch(
VPBasicBlock *BranchBB, std::pair<VPBlockBase *, VPBlockBase *> LeftSESE,
std::pair<VPBlockBase *, VPBlockBase *> RightSESE, VPBasicBlock *JoinBB,
VPRecipeBuilder &RecipeBuilder, VPlanHCFGBuilder &HCFGBuilder) {

// Disconnect the entries/exits of the regions from their RPO
// predecessors/successors, and then re-connect them.
for (auto [Entry, Exiting] : {LeftSESE, RightSESE}) {
if (auto *Pred = Entry->getSinglePredecessor())
VPBlockUtils::disconnectBlocks(Pred, Entry);
if (auto *Succ = Exiting->getSingleSuccessor())
VPBlockUtils::disconnectBlocks(Exiting, Succ);
}
for (auto [Entry, Exiting] : {LeftSESE, RightSESE})
if (Exiting == BranchBB) {
VPBlockUtils::connectBlocks(BranchBB, JoinBB);
} else {
VPBlockUtils::connectBlocks(BranchBB, Entry);
VPBlockUtils::connectBlocks(Exiting, JoinBB);
}

// The mask of the join block is that of the block with the branch.
RecipeBuilder.setBlockInMask(
HCFGBuilder.getIRBBForVPB(JoinBB),
RecipeBuilder.getBlockInMask(HCFGBuilder.getIRBBForVPB(BranchBB)));

// Make sure the phi nodes in JoinBB are not replaced by blends.
for (auto &R : JoinBB->phis()) {
auto *Phi = cast<VPWidenPHIRecipe>(&R);
auto *IRPhi = cast<PHINode>(Phi->getUnderlyingValue());
RecipeBuilder.setRecipe(IRPhi, Phi);
Phi->setOperand(
0, RecipeBuilder.getVPValueOrAddLiveIn(IRPhi->getIncomingValueForBlock(
HCFGBuilder.getIRBBForVPB(LeftSESE.second))));
Phi->setOperand(
1, RecipeBuilder.getVPValueOrAddLiveIn(IRPhi->getIncomingValueForBlock(
HCFGBuilder.getIRBBForVPB(RightSESE.second))));
}
}

VPlanPtr
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {

Expand Down Expand Up @@ -9390,6 +9496,18 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {

auto *MiddleVPBB = Plan->getMiddleBlock();

// A map of the block where the sub-regions on the left and right side
// of a perservable uniform branch join back together.
DenseMap<VPBlockBase *,
std::tuple<VPBasicBlock *, std::pair<VPBlockBase *, VPBlockBase *>,
std::pair<VPBlockBase *, VPBlockBase *>>>
PreservableUniformBranches;

// Purposefully not updated during construction:
VPDominatorTree VPDT;
if (TryToKeepUnifromBranches)
VPDT.recalculate(*Plan);

// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
Expand All @@ -9398,6 +9516,17 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
VPBlockBase *PrevVPBB = nullptr;
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
// Handle a block where a preservable uniform branch joins back together.
bool KeepPhis = false;
if (auto Iter = PreservableUniformBranches.find(VPBB);
Iter != PreservableUniformBranches.end()) {
auto [BranchBB, LeftSESE, RightSESE] = Iter->second;
reconnectVPlanCFGForPreservedBranch(BranchBB, LeftSESE, RightSESE, VPBB,
RecipeBuilder, HCFGBuilder);
PrevVPBB = nullptr;
KeepPhis = true;
}

// Handle VPBBs down to the latch.
if (VPBB == LoopRegion->getExiting()) {
assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
Expand All @@ -9408,15 +9537,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {

// Create mask based on the IR BB corresponding to VPBB.
// TODO: Predicate directly based on VPlan.
BasicBlock *IRBB = HCFGBuilder.getIRBBForVPB(VPBB);
Builder.setInsertPoint(VPBB, VPBB->begin());
if (VPBB == HeaderVPBB) {
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
RecipeBuilder.createHeaderMask();
} else if (NeedsMasks) {
} else if (NeedsMasks && !RecipeBuilder.hasBlockInMask(IRBB)) {
// FIXME: At the moment, masks need to be placed at the beginning of the
// block, as blends introduced for phi nodes need to use it. The created
// blends should be sunk after the mask recipes.
RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
RecipeBuilder.createBlockInMask(IRBB);
}

// Convert input VPInstructions to widened recipes.
Expand All @@ -9429,14 +9559,40 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// FIXME: Migrate code relying on the underlying instruction from VPlan0
// to construct recipes below to not use the underlying instruction.
if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
(isa<VPInstruction>(&R) && !UnderlyingValue))
(isa<VPInstruction>(&R) && !UnderlyingValue) ||
(isa<VPWidenPHIRecipe>(&R) && KeepPhis))
continue;

// FIXME: VPlan0, which models a copy of the original scalar loop, should
// not use VPWidenPHIRecipe to model the phis.
assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
UnderlyingValue && "unsupported recipe");

// Check if the branch can be kept, and if so, remember it for so that the
// CFG can be reconnected later and set the successor masks.
if (auto *Br = dyn_cast<VPInstruction>(&R);
Br && Br->getOpcode() == VPInstruction::BranchOnCond &&
TryToKeepUnifromBranches &&
Legal->isInvariant(Br->getUnderlyingInstr()->getOperand(0))) {
if (auto JoinInfo =
canKeepBranchDuringIfConversion(VPDT, VPBB, HCFGBuilder)) {
auto *IRBr = cast<BranchInst>(UnderlyingValue);
Br->setOperand(
0, RecipeBuilder.getVPValueOrAddLiveIn(IRBr->getCondition()));
VPValue *Mask = RecipeBuilder.getBlockInMask(IRBr->getParent());
RecipeBuilder.setBlockInMask(IRBr->getSuccessor(0), Mask);
RecipeBuilder.setBlockInMask(IRBr->getSuccessor(1), Mask);
auto [JoinBB, LHSExiting, RHSExiting] = JoinInfo.value();
PreservableUniformBranches[JoinBB] = {
VPBB,
{VPBB->getSuccessors()[0], LHSExiting},
{VPBB->getSuccessors()[1], RHSExiting}};
LLVM_DEBUG(dbgs() << "LV: Preserving uniform branch: "; Br->dump();
dbgs() << ", joins at: " << JoinBB->getName() << "\n");
break;
}
}

if (isa<VPInstruction>(&R) &&
(cast<VPInstruction>(&R)->getOpcode() ==
VPInstruction::BranchOnCond ||
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,16 @@ class VPRecipeBuilder {
/// Returns the *entry* mask for the block \p BB.
VPValue *getBlockInMask(BasicBlock *BB) const;

/// Set the block entry block mask for \p BB to \p Mask.
void setBlockInMask(BasicBlock *BB, VPValue *Mask) {
BlockMaskCache[BB] = Mask;
}

/// Return true of there already is a entry block mask for \p BB.
bool hasBlockInMask(const BasicBlock *BB) const {
return BlockMaskCache.contains(BB);
}

/// Create an edge mask for every destination of cases and/or default.
void createSwitchEdgeMasks(SwitchInst *SI);

Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1974,7 +1974,11 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe {
}

VPWidenPHIRecipe *clone() override {
llvm_unreachable("cloning not implemented yet");
auto *Phi = new VPWidenPHIRecipe(
dyn_cast_if_present<PHINode>(getUnderlyingValue()));
for (unsigned I = 0; I < getNumOperands(); I++)
Phi->addOperand(getIncomingValue(I));
return Phi;
}

~VPWidenPHIRecipe() override = default;
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3643,9 +3643,6 @@ VPBasicBlock *VPWidenPHIRecipe::getIncomingBlock(unsigned I) {
}

void VPWidenPHIRecipe::execute(VPTransformState &State) {
assert(EnableVPlanNativePath &&
"Non-native vplans are not expected to have VPWidenPHIRecipes.");

State.setDebugLocFrom(getDebugLoc());
Value *Op0 = State.get(getOperand(0));
Type *VecTy = Op0->getType();
Expand Down
7 changes: 3 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -396,10 +396,9 @@ static bool mergeBlocksIntoPredecessors(VPlan &Plan) {
auto *ParentRegion = cast_or_null<VPRegionBlock>(VPBB->getParent());
if (ParentRegion && ParentRegion->getExiting() == VPBB)
ParentRegion->setExiting(PredVPBB);
for (auto *Succ : to_vector(VPBB->successors())) {
VPBlockUtils::disconnectBlocks(VPBB, Succ);
VPBlockUtils::connectBlocks(PredVPBB, Succ);
}
for (auto *Succ : to_vector(VPBB->successors()))
VPBlockUtils::replacePredecessor(VPBB, PredVPBB, Succ);

// VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
return !WorkList.empty();
Expand Down
15 changes: 11 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ class VPBlockUtils {
public:
VPBlockUtils() = delete;

/// Disconnect \p OldPred from \p Succ and connect \p NewPred to \p Succ
/// instead, but also swaping phi operands in the successor if necessary.
static void replacePredecessor(VPBlockBase *OldPred, VPBlockBase *NewPred,
VPBlockBase *Succ) {
Succ->replacePredecessor(OldPred, NewPred);
OldPred->removeSuccessor(Succ);
NewPred->appendSuccessor(Succ);
}

/// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
/// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
/// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
Expand All @@ -91,10 +100,8 @@ class VPBlockUtils {
"Can't insert new block with predecessors or successors.");
NewBlock->setParent(BlockPtr->getParent());
SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
for (VPBlockBase *Succ : Succs) {
disconnectBlocks(BlockPtr, Succ);
connectBlocks(NewBlock, Succ);
}
for (VPBlockBase *Succ : Succs)
replacePredecessor(BlockPtr, NewBlock, Succ);
connectBlocks(BlockPtr, NewBlock);
}

Expand Down
Loading