Skip to content

Commit 38376de

Browse files
authored
[VPlan] Build initial VPlan 0 using HCFGBuilder for inner loops. (NFC) (#124432)
Use HCFGBuilder to build an initial VPlan 0, which wraps all input instructions in VPInstructions, and update tryToBuildVPlanWithVPRecipes to replace the VPInstructions with widened recipes. At the moment, widened recipes are created based on the underlying instruction of the VPInstruction. Masks are also still created based on the input IR basic blocks and the loop CFG is flattened in the main loop processing the VPInstructions. This patch also includes support for Switch instructions in HCFGBuilder using just a VPInstruction with Instruction::Switch opcode. There are multiple follow-ups planned: * Perform predication on the VPlan directly, * Unify code constructing VPlan 0 to be shared by both inner and outer loop code paths. * Construct VPlan 0 once, clone subsequent ones for VFs. PR: #124432
1 parent 9516f44 commit 38376de

File tree

5 files changed

+149
-57
lines changed

5 files changed

+149
-57
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 94 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -9298,6 +9298,7 @@ static void addExitUsersForFirstOrderRecurrences(
92989298
VPlanPtr
92999299
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93009300

9301+
using namespace llvm::VPlanPatternMatch;
93019302
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
93029303

93039304
// ---------------------------------------------------------------------------
@@ -9321,6 +9322,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93219322
PSE, RequiresScalarEpilogueCheck,
93229323
CM.foldTailByMasking(), OrigLoop);
93239324

9325+
// Build hierarchical CFG.
9326+
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9327+
HCFGBuilder.buildHierarchicalCFG();
9328+
93249329
// Don't use getDecisionAndClampRange here, because we don't know the UF
93259330
// so this function is better to be conservative, rather than to split
93269331
// it up into different VPlans.
@@ -9371,12 +9376,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93719376
// Construct recipes for the instructions in the loop
93729377
// ---------------------------------------------------------------------------
93739378

9374-
// Scan the body of the loop in a topological order to visit each basic block
9375-
// after having visited its predecessor basic blocks.
9376-
LoopBlocksDFS DFS(OrigLoop);
9377-
DFS.perform(LI);
9378-
9379-
VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
9379+
VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
9380+
VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
93809381
VPBasicBlock *VPBB = HeaderVPBB;
93819382
BasicBlock *HeaderBB = OrigLoop->getHeader();
93829383
bool NeedsMasks =
@@ -9389,26 +9390,70 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93899390
RecipeBuilder.collectScaledReductions(Range);
93909391

93919392
auto *MiddleVPBB = Plan->getMiddleBlock();
9393+
9394+
// Scan the body of the loop in a topological order to visit each basic block
9395+
// after having visited its predecessor basic blocks.
9396+
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
9397+
HeaderVPBB);
9398+
93929399
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9393-
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9394-
// Relevant instructions from basic block BB will be grouped into VPRecipe
9395-
// ingredients and fill a new VPBasicBlock.
9396-
if (VPBB != HeaderVPBB)
9397-
VPBB->setName(BB->getName());
9398-
Builder.setInsertPoint(VPBB);
9400+
VPBlockBase *PrevVPBB = nullptr;
9401+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
9402+
// Handle VPBBs down to the latch.
9403+
if (VPBB == LoopRegion->getExiting()) {
9404+
assert(!HCFGBuilder.getIRBBForVPB(VPBB) &&
9405+
"the latch block shouldn't have a corresponding IRBB");
9406+
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
9407+
break;
9408+
}
93999409

9400-
if (VPBB == HeaderVPBB)
9410+
// Create mask based on the IR BB corresponding to VPBB.
9411+
// TODO: Predicate directly based on VPlan.
9412+
Builder.setInsertPoint(VPBB, VPBB->begin());
9413+
if (VPBB == HeaderVPBB) {
9414+
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
94019415
RecipeBuilder.createHeaderMask();
9402-
else if (NeedsMasks)
9403-
RecipeBuilder.createBlockInMask(BB);
9416+
} else if (NeedsMasks) {
9417+
// FIXME: At the moment, masks need to be placed at the beginning of the
9418+
// block, as blends introduced for phi nodes need to use it. The created
9419+
// blends should be sunk after the mask recipes.
9420+
RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
9421+
}
9422+
9423+
// Convert input VPInstructions to widened recipes.
9424+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
9425+
auto *SingleDef = cast<VPSingleDefRecipe>(&R);
9426+
auto *UnderlyingValue = SingleDef->getUnderlyingValue();
9427+
// Skip recipes that do not need transforming, including canonical IV,
9428+
// wide canonical IV and VPInstructions without underlying values. The
9429+
// latter are added above for masking.
9430+
// FIXME: Migrate code relying on the underlying instruction from VPlan0
9431+
// to construct recipes below to not use the underlying instruction.
9432+
if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe>(&R) ||
9433+
(isa<VPInstruction>(&R) && !UnderlyingValue))
9434+
continue;
94049435

9405-
// Introduce each ingredient into VPlan.
9406-
// TODO: Model and preserve debug intrinsics in VPlan.
9407-
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9408-
Instruction *Instr = &I;
9436+
// FIXME: VPlan0, which models a copy of the original scalar loop, should
9437+
// not use VPWidenPHIRecipe to model the phis.
9438+
assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
9439+
UnderlyingValue && "unsupported recipe");
9440+
9441+
if (isa<VPInstruction>(&R) &&
9442+
(cast<VPInstruction>(&R)->getOpcode() ==
9443+
VPInstruction::BranchOnCond ||
9444+
(cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch))) {
9445+
R.eraseFromParent();
9446+
break;
9447+
}
9448+
9449+
// TODO: Gradually replace uses of underlying instruction by analyses on
9450+
// VPlan.
9451+
Instruction *Instr = cast<Instruction>(UnderlyingValue);
9452+
Builder.setInsertPoint(SingleDef);
94099453
SmallVector<VPValue *, 4> Operands;
94109454
auto *Phi = dyn_cast<PHINode>(Instr);
94119455
if (Phi && Phi->getParent() == HeaderBB) {
9456+
// The backedge value will be added in fixHeaderPhis later.
94129457
Operands.push_back(Plan->getOrAddLiveIn(
94139458
Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
94149459
} else {
@@ -9420,15 +9465,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
94209465
// in the exit block, a uniform store recipe will be created for the final
94219466
// invariant store of the reduction.
94229467
StoreInst *SI;
9423-
if ((SI = dyn_cast<StoreInst>(&I)) &&
9468+
if ((SI = dyn_cast<StoreInst>(Instr)) &&
94249469
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
94259470
// Only create recipe for the final invariant store of the reduction.
9426-
if (!Legal->isInvariantStoreOfReduction(SI))
9427-
continue;
9428-
auto *Recipe = new VPReplicateRecipe(
9429-
SI, make_range(Operands.begin(), Operands.end()),
9430-
true /* IsUniform */);
9431-
Recipe->insertBefore(*MiddleVPBB, MBIP);
9471+
if (Legal->isInvariantStoreOfReduction(SI)) {
9472+
auto *Recipe = new VPReplicateRecipe(
9473+
SI, make_range(Operands.begin(), Operands.end()),
9474+
true /* IsUniform */);
9475+
Recipe->insertBefore(*MiddleVPBB, MBIP);
9476+
}
9477+
R.eraseFromParent();
94329478
continue;
94339479
}
94349480

@@ -9438,25 +9484,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
94389484
Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range);
94399485

94409486
RecipeBuilder.setRecipe(Instr, Recipe);
9441-
if (isa<VPHeaderPHIRecipe>(Recipe)) {
9442-
// VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
9443-
// the following cases, VPHeaderPHIRecipes may be created after non-phi
9444-
// recipes and need to be moved to the phi section of HeaderVPBB:
9445-
// * tail-folding (non-phi recipes computing the header mask are
9446-
// introduced earlier than regular header phi recipes, and should appear
9447-
// after them)
9448-
// * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
9449-
9450-
assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9451-
CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9452-
"unexpected recipe needs moving");
9487+
if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) && isa<TruncInst>(Instr)) {
9488+
// Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
9489+
// moved to the phi section in the header.
94539490
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9454-
} else
9455-
VPBB->appendRecipe(Recipe);
9456-
}
9457-
9458-
VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9459-
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9491+
} else {
9492+
Builder.insert(Recipe);
9493+
}
9494+
if (Recipe->getNumDefinedValues() == 1)
9495+
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
9496+
else
9497+
assert(Recipe->getNumDefinedValues() == 0 &&
9498+
"Unexpected multidef recipe");
9499+
R.eraseFromParent();
9500+
}
9501+
9502+
// Flatten the CFG in the loop. Masks for blocks have already been generated
9503+
// and added to recipes as needed. To do so, first disconnect VPBB from its
9504+
// successors. Then connect VPBB to the previously visited VPBB.
9505+
for (auto *Succ : to_vector(VPBB->getSuccessors()))
9506+
VPBlockUtils::disconnectBlocks(VPBB, Succ);
9507+
if (PrevVPBB)
9508+
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
9509+
PrevVPBB = VPBB;
94609510
}
94619511

94629512
// After here, VPBB should not be used.

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -600,16 +600,25 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
600600
}
601601

602602
const VPRecipeBase *R = &VPBB->back();
603+
bool IsSwitch = isa<VPInstruction>(R) &&
604+
cast<VPInstruction>(R)->getOpcode() == Instruction::Switch;
603605
bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
604606
match(R, m_BranchOnCond(m_VPValue())) ||
605607
match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
606608
(void)IsCondBranch;
607-
608-
if (VPBB->getNumSuccessors() >= 2 ||
609+
(void)IsSwitch;
610+
if (VPBB->getNumSuccessors() == 2 ||
609611
(VPBB->isExiting() && !VPBB->getParent()->isReplicator())) {
610-
assert(IsCondBranch && "block with multiple successors not terminated by "
611-
"conditional branch recipe");
612+
assert((IsCondBranch || IsSwitch) &&
613+
"block with multiple successors not terminated by "
614+
"conditional branch nor switch recipe");
615+
616+
return true;
617+
}
612618

619+
if (VPBB->getNumSuccessors() > 2) {
620+
assert(IsSwitch && "block with more than 2 successors not terminated by "
621+
"a switch recipe");
613622
return true;
614623
}
615624

llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ class PlainCFGBuilder {
7575
: TheLoop(Lp), LI(LI), Plan(P) {}
7676

7777
/// Build plain CFG for TheLoop and connects it to Plan's entry.
78-
void buildPlainCFG();
78+
void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
7979
};
8080
} // anonymous namespace
8181

@@ -242,10 +242,10 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
242242
// Instruction definition is in outermost loop PH.
243243
return false;
244244

245-
// Check whether Instruction definition is in the loop exit.
246-
BasicBlock *Exit = TheLoop->getUniqueExitBlock();
247-
assert(Exit && "Expected loop with single exit.");
248-
if (InstParent == Exit) {
245+
// Check whether Instruction definition is in a loop exit.
246+
SmallVector<BasicBlock *> ExitBlocks;
247+
TheLoop->getExitBlocks(ExitBlocks);
248+
if (is_contained(ExitBlocks, InstParent)) {
249249
// Instruction definition is in outermost loop exit.
250250
return false;
251251
}
@@ -288,6 +288,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
288288
void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
289289
BasicBlock *BB) {
290290
VPIRBuilder.setInsertPoint(VPBB);
291+
// TODO: Model and preserve debug intrinsics in VPlan.
291292
for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
292293
Instruction *Inst = &InstRef;
293294

@@ -313,6 +314,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
313314
continue;
314315
}
315316

317+
if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
318+
SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
319+
for (auto Case : SI->cases())
320+
Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
321+
VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
322+
continue;
323+
}
324+
316325
VPValue *NewVPV;
317326
if (auto *Phi = dyn_cast<PHINode>(Inst)) {
318327
// Phi node's operands may have not been visited at this point. We create
@@ -339,7 +348,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
339348
}
340349

341350
// Main interface to build the plain CFG.
342-
void PlainCFGBuilder::buildPlainCFG() {
351+
void PlainCFGBuilder::buildPlainCFG(
352+
DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
343353
// 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
344354
// skeleton. These were created directly rather than via getOrCreateVPBB(),
345355
// revisit them now to update BB2VPBB. Note that header/entry and
@@ -428,6 +438,14 @@ void PlainCFGBuilder::buildPlainCFG() {
428438
// Set VPBB successors. We create empty VPBBs for successors if they don't
429439
// exist already. Recipes will be created when the successor is visited
430440
// during the RPO traversal.
441+
if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
442+
SmallVector<VPBlockBase *> Succs = {
443+
getOrCreateVPBB(SI->getDefaultDest())};
444+
for (auto Case : SI->cases())
445+
Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
446+
VPBB->setSuccessors(Succs);
447+
continue;
448+
}
431449
auto *BI = cast<BranchInst>(BB->getTerminator());
432450
unsigned NumSuccs = succ_size(BB);
433451
if (NumSuccs == 1) {
@@ -481,11 +499,14 @@ void PlainCFGBuilder::buildPlainCFG() {
481499
// have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
482500
// VPlan operands.
483501
fixPhiNodes();
502+
503+
for (const auto &[IRBB, VPB] : BB2VPBB)
504+
VPB2IRBB[VPB] = IRBB;
484505
}
485506

486507
void VPlanHCFGBuilder::buildPlainCFG() {
487508
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
488-
PCFGBuilder.buildPlainCFG();
509+
PCFGBuilder.buildPlainCFG(VPB2IRBB);
489510
}
490511

491512
// Public interface to build a H-CFG.

llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ class VPlanHCFGBuilder {
5353
// are introduced.
5454
VPDominatorTree VPDomTree;
5555

56+
/// Map of created VP blocks to their input IR basic blocks, if they have been
57+
/// created for an input IR basic block.
58+
DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
59+
5660
/// Build plain CFG for TheLoop and connects it to Plan's entry.
5761
void buildPlainCFG();
5862

@@ -62,6 +66,14 @@ class VPlanHCFGBuilder {
6266

6367
/// Build H-CFG for TheLoop and update Plan accordingly.
6468
void buildHierarchicalCFG();
69+
70+
/// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
71+
/// there is no such corresponding block.
72+
/// FIXME: This is a temporary workaround to drive the createBlockInMask.
73+
/// Remove once mask creation is done on VPlan.
74+
BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
75+
return VPB2IRBB.lookup(VPB);
76+
}
6577
};
6678
} // namespace llvm
6779

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
4646
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
4747
; CHECK-NEXT: LV: Using user VF vscale x 4.
4848
; CHECK-NEXT: LV: Loop does not require scalar epilogue
49-
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
49+
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
5050
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
5151
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
5252
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
@@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
295295
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
296296
; CHECK-NEXT: LV: Using user VF vscale x 4.
297297
; CHECK-NEXT: LV: Loop does not require scalar epilogue
298-
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
298+
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
299299
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
300300
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
301301
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom

0 commit comments

Comments
 (0)