Skip to content

Commit 85721ea

Browse files
committed
[VPlan] Build initial VPlan 0 using HCFGBuilder for inner loops. (NFC)
Use HCFGBuilder to build an initial VPlan 0, which wraps all input instructions in VPInstructions and update tryToBuildVPlanWithVPRecipes to replace the VPInstructions with widened recipes. At the moment, widened recipes are created based on the underlying instruction of the VPInstruction. Masks are also still created based on the input IR basic blocks and the loop CFG is flattened in the main loop processing the VPInstructions. This patch also includes support for Switch instructions in HCFGBuilder using just a VPInstruction with Instruction::Switch opcode. There are multiple follow-ups planned: * Use VPIRInstructions instead of VPInstructions in HCFGBuilder, * Perform predication on the VPlan directly, * Unify code constructing VPlan 0 to be shared by both inner and outer loop code paths.
1 parent 7469032 commit 85721ea

File tree

5 files changed

+108
-32
lines changed

5 files changed

+108
-32
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8309,7 +8309,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
83098309
: GEPNoWrapFlags::none(),
83108310
I->getDebugLoc());
83118311
}
8312-
Builder.getInsertBlock()->appendRecipe(VectorPtr);
8312+
VectorPtr->insertBefore(&*Builder.getInsertPoint());
83138313
Ptr = VectorPtr;
83148314
}
83158315
if (LoadInst *Load = dyn_cast<LoadInst>(I))
@@ -9221,6 +9221,7 @@ static void addExitUsersForFirstOrderRecurrences(
92219221
VPlanPtr
92229222
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92239223

9224+
using namespace llvm::VPlanPatternMatch;
92249225
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
92259226

92269227
// ---------------------------------------------------------------------------
@@ -9244,6 +9245,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92449245
PSE, RequiresScalarEpilogueCheck,
92459246
CM.foldTailByMasking(), OrigLoop);
92469247

9248+
// Build hierarchical CFG.
9249+
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9250+
HCFGBuilder.buildHierarchicalCFG();
9251+
92479252
// Don't use getDecisionAndClampRange here, because we don't know the UF
92489253
// so this function is better to be conservative, rather than to split
92499254
// it up into different VPlans.
@@ -9312,23 +9317,45 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93129317
RecipeBuilder.collectScaledReductions(Range);
93139318

93149319
auto *MiddleVPBB = Plan->getMiddleBlock();
9320+
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
9321+
Plan->getVectorLoopRegion()->getEntry());
9322+
93159323
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9316-
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9317-
// Relevant instructions from basic block BB will be grouped into VPRecipe
9318-
// ingredients and fill a new VPBasicBlock.
9319-
if (VPBB != HeaderVPBB)
9320-
VPBB->setName(BB->getName());
9321-
Builder.setInsertPoint(VPBB);
9324+
VPBlockBase *PrevVPBB = nullptr;
9325+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
9326+
// Skip VPBBs not corresponding to any input IR basic blocks.
9327+
if (!HCFGBuilder.getIRBBForVPB(VPBB))
9328+
continue;
93229329

9323-
if (VPBB == HeaderVPBB)
9330+
// Create mask based on the IR BB corresponding to VPBB.
9331+
// TODO: Predicate directly based on VPlan.
9332+
if (VPBB == HeaderVPBB) {
9333+
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
93249334
RecipeBuilder.createHeaderMask();
9325-
else if (NeedsMasks)
9326-
RecipeBuilder.createBlockInMask(BB);
9335+
} else if (NeedsMasks) {
9336+
Builder.setInsertPoint(VPBB, VPBB->begin());
9337+
RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
9338+
}
93279339

9328-
// Introduce each ingredient into VPlan.
9340+
// Convert input VPInstructions to widened recipes.
93299341
// TODO: Model and preserve debug intrinsics in VPlan.
9330-
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9331-
Instruction *Instr = &I;
9342+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
9343+
auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
9344+
if (!isa<VPWidenPHIRecipe>(&R) &&
9345+
(!isa<VPInstruction>(SingleDef) || !SingleDef->getUnderlyingValue()))
9346+
continue;
9347+
9348+
if (match(&R, m_BranchOnCond(m_VPValue())) ||
9349+
(isa<VPInstruction>(&R) &&
9350+
cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch)) {
9351+
R.eraseFromParent();
9352+
break;
9353+
}
9354+
9355+
// TODO: Gradually replace uses of underlying instruction by analyses on
9356+
// VPlan.
9357+
Instruction *Instr = SingleDef->getUnderlyingInstr();
9358+
Builder.setInsertPoint(SingleDef);
93329359
SmallVector<VPValue *, 4> Operands;
93339360
auto *Phi = dyn_cast<PHINode>(Instr);
93349361
if (Phi && Phi->getParent() == HeaderBB) {
@@ -9343,15 +9370,18 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93439370
// in the exit block, a uniform store recipe will be created for the final
93449371
// invariant store of the reduction.
93459372
StoreInst *SI;
9346-
if ((SI = dyn_cast<StoreInst>(&I)) &&
9373+
if ((SI = dyn_cast<StoreInst>(Instr)) &&
93479374
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
93489375
// Only create recipe for the final invariant store of the reduction.
9349-
if (!Legal->isInvariantStoreOfReduction(SI))
9376+
if (!Legal->isInvariantStoreOfReduction(SI)) {
9377+
R.eraseFromParent();
93509378
continue;
9379+
}
93519380
auto *Recipe = new VPReplicateRecipe(
93529381
SI, RecipeBuilder.mapToVPValues(Instr->operands()),
93539382
true /* IsUniform */);
93549383
Recipe->insertBefore(*MiddleVPBB, MBIP);
9384+
R.eraseFromParent();
93559385
continue;
93569386
}
93579387

@@ -9370,16 +9400,30 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93709400
// after them)
93719401
// * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
93729402

9373-
assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9374-
CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9375-
"unexpected recipe needs moving");
93769403
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
93779404
} else
9378-
VPBB->appendRecipe(Recipe);
9405+
Recipe->insertBefore(&R);
9406+
if (Recipe->getNumDefinedValues() == 1)
9407+
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
9408+
else
9409+
assert(Recipe->getNumDefinedValues() == 0);
9410+
R.eraseFromParent();
93799411
}
93809412

9381-
VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9382-
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9413+
// Flatten the CFG in the loop. Masks for blocks have already been generated
9414+
// and added to recipes as needed. To do so, first disconnect VPBB from its
9415+
// predecessors and successors, except the exiting block. Then connect VPBB
9416+
// to the previously visited VPBB.
9417+
for (auto *Succ : to_vector(VPBB->getSuccessors())) {
9418+
if (Succ == Plan->getVectorLoopRegion()->getExiting())
9419+
continue;
9420+
VPBlockUtils::disconnectBlocks(VPBB, Succ);
9421+
}
9422+
for (auto *Pred : to_vector(VPBB->getPredecessors()))
9423+
VPBlockUtils::disconnectBlocks(Pred, VPBB);
9424+
if (PrevVPBB)
9425+
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
9426+
PrevVPBB = VPBB;
93839427
}
93849428

93859429
// After here, VPBB should not be used.

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -587,9 +587,11 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
587587
}
588588

589589
const VPRecipeBase *R = &VPBB->back();
590-
bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
591-
match(R, m_BranchOnCond(m_VPValue())) ||
592-
match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
590+
bool IsCondBranch =
591+
isa<VPBranchOnMaskRecipe>(R) || match(R, m_BranchOnCond(m_VPValue())) ||
592+
match(R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
593+
(isa<VPInstruction>(R) &&
594+
cast<VPInstruction>(R)->getOpcode() == Instruction::Switch);
593595
(void)IsCondBranch;
594596

595597
if (VPBB->getNumSuccessors() >= 2 ||

llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ class PlainCFGBuilder {
7575
: TheLoop(Lp), LI(LI), Plan(P) {}
7676

7777
/// Build plain CFG for TheLoop and connects it to Plan's entry.
78-
void buildPlainCFG();
78+
void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
7979
};
8080
} // anonymous namespace
8181

@@ -238,9 +238,9 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
238238
return false;
239239

240240
// Check whether Instruction definition is in the loop exit.
241-
BasicBlock *Exit = TheLoop->getUniqueExitBlock();
242-
assert(Exit && "Expected loop with single exit.");
243-
if (InstParent == Exit) {
241+
SmallVector<BasicBlock *> ExitBlocks;
242+
TheLoop->getExitBlocks(ExitBlocks);
243+
if (is_contained(ExitBlocks, InstParent)) {
244244
// Instruction definition is in outermost loop exit.
245245
return false;
246246
}
@@ -308,6 +308,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
308308
continue;
309309
}
310310

311+
if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
312+
SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
313+
for (auto Case : SI->cases())
314+
Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
315+
VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
316+
continue;
317+
}
318+
311319
VPValue *NewVPV;
312320
if (auto *Phi = dyn_cast<PHINode>(Inst)) {
313321
// Phi node's operands may have not been visited at this point. We create
@@ -334,7 +342,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
334342
}
335343

336344
// Main interface to build the plain CFG.
337-
void PlainCFGBuilder::buildPlainCFG() {
345+
void PlainCFGBuilder::buildPlainCFG(
346+
DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
338347
// 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
339348
// skeleton. These were created directly rather than via getOrCreateVPBB(),
340349
// revisit them now to update BB2VPBB. Note that header/entry and
@@ -423,6 +432,14 @@ void PlainCFGBuilder::buildPlainCFG() {
423432
// Set VPBB successors. We create empty VPBBs for successors if they don't
424433
// exist already. Recipes will be created when the successor is visited
425434
// during the RPO traversal.
435+
if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
436+
SmallVector<VPBlockBase *> Succs = {
437+
getOrCreateVPBB(SI->getDefaultDest())};
438+
for (auto Case : SI->cases())
439+
Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
440+
VPBB->setSuccessors(Succs);
441+
continue;
442+
}
426443
auto *BI = cast<BranchInst>(BB->getTerminator());
427444
unsigned NumSuccs = succ_size(BB);
428445
if (NumSuccs == 1) {
@@ -476,11 +493,14 @@ void PlainCFGBuilder::buildPlainCFG() {
476493
// have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
477494
// VPlan operands.
478495
fixPhiNodes();
496+
497+
for (const auto &[IRBB, VPB] : BB2VPBB)
498+
VPB2IRBB[VPB] = IRBB;
479499
}
480500

481501
void VPlanHCFGBuilder::buildPlainCFG() {
482502
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
483-
PCFGBuilder.buildPlainCFG();
503+
PCFGBuilder.buildPlainCFG(VPB2IRBB);
484504
}
485505

486506
// Public interface to build a H-CFG.

llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ class VPlanHCFGBuilder {
5353
// are introduced.
5454
VPDominatorTree VPDomTree;
5555

56+
/// Map of created VP blocks to their input IR basic blocks, if they have been
57+
/// created for an input IR basic block.
58+
DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
59+
5660
/// Build plain CFG for TheLoop and connects it to Plan's entry.
5761
void buildPlainCFG();
5862

@@ -62,6 +66,12 @@ class VPlanHCFGBuilder {
6266

6367
/// Build H-CFG for TheLoop and update Plan accordingly.
6468
void buildHierarchicalCFG();
69+
70+
/// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
71+
/// there is no such corresponding block.
72+
BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
73+
return VPB2IRBB.lookup(VPB);
74+
}
6575
};
6676
} // namespace llvm
6777

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
4646
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
4747
; CHECK-NEXT: LV: Using user VF vscale x 4.
4848
; CHECK-NEXT: LV: Loop does not require scalar epilogue
49-
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
49+
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
5050
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
5151
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
5252
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
@@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
295295
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
296296
; CHECK-NEXT: LV: Using user VF vscale x 4.
297297
; CHECK-NEXT: LV: Loop does not require scalar epilogue
298-
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
298+
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
299299
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
300300
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
301301
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom

0 commit comments

Comments
 (0)