|
59 | 59 | #include "VPlan.h"
|
60 | 60 | #include "VPlanAnalysis.h"
|
61 | 61 | #include "VPlanHCFGBuilder.h"
|
| 62 | +#include "VPlanPatternMatch.h" |
62 | 63 | #include "VPlanTransforms.h"
|
63 | 64 | #include "VPlanVerifier.h"
|
64 | 65 | #include "llvm/ADT/APInt.h"
|
@@ -1652,10 +1653,6 @@ class LoopVectorizationCostModel {
|
1652 | 1653 | /// of elements.
|
1653 | 1654 | ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
|
1654 | 1655 |
|
1655 |
| - /// Returns the execution time cost of an instruction for a given vector |
1656 |
| - /// width. Vector width of one means scalar. |
1657 |
| - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); |
1658 |
| - |
1659 | 1656 | /// The cost-computation logic from getInstructionCost which provides
|
1660 | 1657 | /// the vector type as an output parameter.
|
1661 | 1658 | InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
|
@@ -1819,6 +1816,10 @@ class LoopVectorizationCostModel {
|
1819 | 1816 | }
|
1820 | 1817 |
|
1821 | 1818 | public:
|
| 1819 | + /// Returns the execution time cost of an instruction for a given vector |
| 1820 | + /// width. Vector width of one means scalar. |
| 1821 | + VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); |
| 1822 | + |
1822 | 1823 | /// The loop that we evaluate.
|
1823 | 1824 | Loop *TheLoop;
|
1824 | 1825 |
|
@@ -7395,6 +7396,177 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
|
7395 | 7396 | return VF;
|
7396 | 7397 | }
|
7397 | 7398 |
|
| 7399 | +static InstructionCost |
| 7400 | +computeCostForRecipe(VPRecipeBase *R, ElementCount VF, |
| 7401 | + SmallPtrSetImpl<Instruction *> &SeenUI, |
| 7402 | + LoopVectorizationCostModel &CM, |
| 7403 | + const TargetTransformInfo &TTI, VPCostContext CostCtx) { |
| 7404 | + Instruction *UI = nullptr; |
| 7405 | + if (auto *S = dyn_cast<VPSingleDefRecipe>(R)) |
| 7406 | + UI = dyn_cast_or_null<Instruction>(S->getUnderlyingValue()); |
| 7407 | + if (UI && (CM.VecValuesToIgnore.contains(UI) || !SeenUI.insert(UI).second)) |
| 7408 | + return 0; |
| 7409 | + |
| 7410 | + InstructionCost RecipeCost = R->computeCost(VF, CostCtx); |
| 7411 | + if (!RecipeCost.isValid()) { |
| 7412 | + if (auto *IG = dyn_cast<VPInterleaveRecipe>(R)) { |
| 7413 | + RecipeCost = CM.getInstructionCost(IG->getInsertPos(), VF).first; |
| 7414 | + } else if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R)) { |
| 7415 | + RecipeCost = CM.getInstructionCost(&WidenMem->getIngredient(), VF).first; |
| 7416 | + } else if (UI) { |
| 7417 | + RecipeCost = CM.getInstructionCost(UI, VF).first; |
| 7418 | + } else |
| 7419 | + return 0; |
| 7420 | + } |
| 7421 | + if (ForceTargetInstructionCost.getNumOccurrences() > 0 && |
| 7422 | + RecipeCost.isValid()) |
| 7423 | + RecipeCost = InstructionCost(ForceTargetInstructionCost); |
| 7424 | + |
| 7425 | + LLVM_DEBUG({ |
| 7426 | + dbgs() << "Cost of " << RecipeCost << " for VF " << VF << ": "; |
| 7427 | + R->dump(); |
| 7428 | + }); |
| 7429 | + return RecipeCost; |
| 7430 | +} |
| 7431 | + |
| 7432 | +static InstructionCost computeCostForReplicatorRegion( |
| 7433 | + VPRegionBlock *Region, ElementCount VF, |
| 7434 | + SmallPtrSetImpl<Instruction *> &SeenUI, LoopVectorizationCostModel &CM, |
| 7435 | + const TargetTransformInfo &TTI, LLVMContext &Ctx, VPCostContext CostCtx) { |
| 7436 | + using namespace llvm::VPlanPatternMatch; |
| 7437 | + InstructionCost RegionCost = 0; |
| 7438 | + assert(Region->isReplicator() && |
| 7439 | + "can only compute cost for a replicator region"); |
| 7440 | + VPBasicBlock *Then = |
| 7441 | + cast<VPBasicBlock>(Region->getEntry()->getSuccessors()[0]); |
| 7442 | + for (VPRecipeBase &R : *Then) |
| 7443 | + RegionCost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx); |
| 7444 | + |
| 7445 | + // Note the cost estimates below closely match the current legacy cost model. |
| 7446 | + auto *BOM = |
| 7447 | + cast<VPBranchOnMaskRecipe>(&Region->getEntryBasicBlock()->front()); |
| 7448 | + VPValue *Cond = BOM->getOperand(0); |
| 7449 | + |
| 7450 | + // Check if Cond is a uniform compare. |
| 7451 | + auto IsUniformCompare = [Cond]() { |
| 7452 | + VPValue *Op = Cond; |
| 7453 | + if (match(Op, m_Not(m_VPValue()))) |
| 7454 | + Op = Op->getDefiningRecipe()->getOperand(0); |
| 7455 | + auto *R = Op->getDefiningRecipe(); |
| 7456 | + if (!R) |
| 7457 | + return true; |
| 7458 | + if (!match(R, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue()))) |
| 7459 | + return false; |
| 7460 | + return all_of(R->operands(), [](VPValue *Op) { |
| 7461 | + return vputils::isUniformAfterVectorization(Op); |
| 7462 | + }); |
| 7463 | + }(); |
| 7464 | + bool IsHeaderMaskOrUniformCond = |
| 7465 | + IsUniformCompare || |
| 7466 | + match(Cond, m_ActiveLaneMask(m_VPValue(), m_VPValue())) || |
| 7467 | + match(Cond, m_Binary<Instruction::ICmp>(m_VPValue(), m_VPValue())) || |
| 7468 | + isa<VPActiveLaneMaskPHIRecipe>(Cond); |
| 7469 | + if (IsHeaderMaskOrUniformCond || VF.isScalable()) |
| 7470 | + return RegionCost; |
| 7471 | + |
| 7472 | + // For the scalar case, we may not always execute the original predicated |
| 7473 | + // block, Thus, scale the block's cost by the probability of executing it. |
| 7474 | + // blockNeedsPredication from Legal is used so as to not include all blocks in |
| 7475 | + // tail folded loops. |
| 7476 | + if (VF.isScalar()) |
| 7477 | + return RegionCost / getReciprocalPredBlockProb(); |
| 7478 | + |
| 7479 | + // Add the cost for branches around scalarized and predicated blocks. |
| 7480 | + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 7481 | + auto *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(Ctx), VF); |
| 7482 | + return RegionCost + |
| 7483 | + TTI.getScalarizationOverhead( |
| 7484 | + Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), |
| 7485 | + /*Insert*/ false, /*Extract*/ true, CostKind) + |
| 7486 | + (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()); |
| 7487 | +} |
| 7488 | + |
| 7489 | +InstructionCost LoopVectorizationPlanner::computeCost(VPlan &Plan, |
| 7490 | + ElementCount VF) { |
| 7491 | + InstructionCost Cost = 0; |
| 7492 | + SmallPtrSet<Instruction *, 8> SeenUI; |
| 7493 | + LLVMContext &Ctx = OrigLoop->getHeader()->getContext(); |
| 7494 | + VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), Ctx); |
| 7495 | + |
| 7496 | + // Cost modeling for inductions is inaccurate in the legacy cost model |
| 7497 | + // compared to the recipes that are generated. To match here initially during |
| 7498 | + // VPlan cost model bring up directly use the induction costs from the legacy |
| 7499 | + // cost model and skip induction recipes. |
| 7500 | + for (const auto &[IV, _] : Legal->getInductionVars()) { |
| 7501 | + Instruction *IVInc = cast<Instruction>( |
| 7502 | + IV->getIncomingValueForBlock(OrigLoop->getLoopLatch())); |
| 7503 | + InstructionCost RecipeCost = CM.getInstructionCost(IVInc, VF).first; |
| 7504 | + LLVM_DEBUG({ |
| 7505 | + dbgs() << "Cost of " << RecipeCost << " for VF " << VF |
| 7506 | + << ":\n induction increment "; |
| 7507 | + IVInc->dump(); |
| 7508 | + }); |
| 7509 | + Cost += RecipeCost; |
| 7510 | + SeenUI.insert(IVInc); |
| 7511 | + } |
| 7512 | + |
| 7513 | + VPBasicBlock *Header = |
| 7514 | + cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getEntry()); |
| 7515 | + for (VPBlockBase *Block : to_vector(vp_depth_first_shallow(Header))) { |
| 7516 | + if (auto *Region = dyn_cast<VPRegionBlock>(Block)) { |
| 7517 | + Cost += computeCostForReplicatorRegion(Region, VF, SeenUI, CM, CM.TTI, |
| 7518 | + Ctx, CostCtx); |
| 7519 | + continue; |
| 7520 | + } |
| 7521 | + |
| 7522 | + for (VPRecipeBase &R : *cast<VPBasicBlock>(Block)) |
| 7523 | + Cost += computeCostForRecipe(&R, VF, SeenUI, CM, CM.TTI, CostCtx); |
| 7524 | + } |
| 7525 | + |
| 7526 | + // Add the cost for the backedge. |
| 7527 | + Cost += 1; |
| 7528 | + LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n"); |
| 7529 | + return Cost; |
| 7530 | +} |
| 7531 | + |
| 7532 | +std::pair<VPlan &, ElementCount> LoopVectorizationPlanner::getBestPlan() { |
| 7533 | + // If there is a single VPlan with a single VF, return it directly. |
| 7534 | + if (VPlans.size() == 1 && size(VPlans[0]->vectorFactors()) == 1) { |
| 7535 | + ElementCount VF = *VPlans[0]->vectorFactors().begin(); |
| 7536 | + return {*VPlans[0], VF}; |
| 7537 | + } |
| 7538 | + |
| 7539 | + VPlan *BestPlan = &*VPlans[0]; |
| 7540 | + assert(hasPlanWithVF(ElementCount::getFixed(1))); |
| 7541 | + ElementCount BestVF = ElementCount::getFixed(1); |
| 7542 | + |
| 7543 | + InstructionCost ScalarCost = computeCost( |
| 7544 | + getBestPlanFor(ElementCount::getFixed(1)), ElementCount::getFixed(1)); |
| 7545 | + InstructionCost BestCost = ScalarCost; |
| 7546 | + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; |
| 7547 | + if (ForceVectorization) { |
| 7548 | + // Ignore scalar width, because the user explicitly wants vectorization. |
| 7549 | + // Initialize cost to max so that VF = 2 is, at least, chosen during cost |
| 7550 | + // evaluation. |
| 7551 | + BestCost = InstructionCost::getMax(); |
| 7552 | + } |
| 7553 | + |
| 7554 | + for (auto &P : VPlans) { |
| 7555 | + for (ElementCount VF : P->vectorFactors()) { |
| 7556 | + if (VF.isScalar()) |
| 7557 | + continue; |
| 7558 | + InstructionCost Cost = computeCost(*P, VF); |
| 7559 | + if (isMoreProfitable(VectorizationFactor(VF, Cost, ScalarCost), |
| 7560 | + VectorizationFactor(BestVF, BestCost, ScalarCost))) { |
| 7561 | + BestCost = Cost; |
| 7562 | + BestVF = VF; |
| 7563 | + BestPlan = &*P; |
| 7564 | + } |
| 7565 | + } |
| 7566 | + } |
| 7567 | + return {*BestPlan, BestVF}; |
| 7568 | +} |
| 7569 | + |
7398 | 7570 | VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
|
7399 | 7571 | assert(count_if(VPlans,
|
7400 | 7572 | [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
|
@@ -10176,8 +10348,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
|
10176 | 10348 | VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
|
10177 | 10349 | PSI, Checks);
|
10178 | 10350 |
|
10179 |
| - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); |
10180 |
| - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); |
| 10351 | + const auto &[BestPlan, Width] = LVP.getBestPlan(); |
| 10352 | + LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width |
| 10353 | + << "\n"); |
| 10354 | + assert(VF.Width == Width && |
| 10355 | + "VPlan cost model and legacy cost model disagreed"); |
| 10356 | + LVP.executePlan(Width, IC, BestPlan, LB, DT, false); |
10181 | 10357 | ++LoopsVectorized;
|
10182 | 10358 |
|
10183 | 10359 | // Add metadata to disable runtime unrolling a scalar loop when there
|
|
0 commit comments