Skip to content

Commit a858f4f

Browse files
committed
[VPlan] Implement initial vector code generation support for simple outer loops.
Summary: [VPlan] Implement vector code generation support for simple outer loops.

Context: Patch Series #1 for outer loop vectorization support in LV using VPlan (RFC: http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html).

This patch introduces vector code generation support for the simple outer loops that are currently supported in the VPlanNativePath. The changes here essentially do the following:
- force vector code generation using an explicit vectorize_width
- add conservative early returns in the cost model and other places for the VPlanNativePath
- add code for setting up outer loop inductions
- add support for widening non-induction PHIs that can result from inner loops and uniform conditional branches
- add support for generating uniform inner branches

We plan to add a handful of C outer loop executable tests once the initial code generation support is committed. This patch is expected to be NFC for the inner loop vectorizer path. Since we are moving in the direction of supporting outer loop vectorization in LV, it may also be time to rename classes such as InnerLoopVectorizer.

Reviewers: fhahn, rengolin, hsaito, dcaballe, mkuper, hfinkel, Ayal
Reviewed By: fhahn, hsaito
Subscribers: dmgreen, bollu, tschuett, rkruppe, rogfer01, llvm-commits
Differential Revision: https://reviews.llvm.org/D50820
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@342197 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 2a6d3ac commit a858f4f

File tree

9 files changed

+486
-15
lines changed

9 files changed

+486
-15
lines changed

include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,11 @@ class LoopVectorizationLegality {
332332
/// If false, good old LV code.
333333
bool canVectorizeLoopNestCFG(Loop *Lp, bool UseVPlanNativePath);
334334

335+
/// Set up outer loop inductions by checking Phis in outer loop header for
336+
/// supported inductions (int inductions). Return false if any of these Phis
337+
/// is not a supported induction or if we fail to find an induction.
338+
bool setupOuterLoopInductions();
339+
335340
/// Return true if the pre-header, exiting and latch blocks of \p Lp
336341
/// (non-recursive) are considered legal for vectorization.
337342
/// Temporarily taking UseVPlanNativePath parameter. If true, take

lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,18 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
516516
return false;
517517
}
518518

519+
// Check whether we are able to set up outer loop induction.
520+
if (!setupOuterLoopInductions()) {
521+
LLVM_DEBUG(
522+
dbgs() << "LV: Not vectorizing: Unsupported outer loop Phi(s).\n");
523+
ORE->emit(createMissedAnalysis("UnsupportedPhi")
524+
<< "Unsupported outer loop Phi(s)");
525+
if (DoExtraAnalysis)
526+
Result = false;
527+
else
528+
return false;
529+
}
530+
519531
return Result;
520532
}
521533

@@ -571,6 +583,32 @@ void LoopVectorizationLegality::addInductionPhi(
571583
LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
572584
}
573585

586+
bool LoopVectorizationLegality::setupOuterLoopInductions() {
587+
BasicBlock *Header = TheLoop->getHeader();
588+
589+
// Returns true if a given Phi is a supported induction.
590+
auto isSupportedPhi = [&](PHINode &Phi) -> bool {
591+
InductionDescriptor ID;
592+
if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
593+
ID.getKind() == InductionDescriptor::IK_IntInduction) {
594+
addInductionPhi(&Phi, ID, AllowedExit);
595+
return true;
596+
} else {
597+
// Bail out for any Phi in the outer loop header that is not a supported
598+
// induction.
599+
LLVM_DEBUG(
600+
dbgs()
601+
<< "LV: Found unsupported PHI for outer loop vectorization.\n");
602+
return false;
603+
}
604+
};
605+
606+
if (llvm::all_of(Header->phis(), isSupportedPhi))
607+
return true;
608+
else
609+
return false;
610+
}
611+
574612
bool LoopVectorizationLegality::canVectorizeInstrs() {
575613
BasicBlock *Header = TheLoop->getHeader();
576614

lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 147 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
#include "LoopVectorizationPlanner.h"
5959
#include "VPRecipeBuilder.h"
6060
#include "VPlanHCFGBuilder.h"
61+
#include "VPlanHCFGTransforms.h"
6162
#include "llvm/ADT/APInt.h"
6263
#include "llvm/ADT/ArrayRef.h"
6364
#include "llvm/ADT/DenseMap.h"
@@ -234,7 +235,7 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
234235
cl::desc("The maximum interleave count to use when interleaving a scalar "
235236
"reduction in a nested loop."));
236237

237-
static cl::opt<bool> EnableVPlanNativePath(
238+
cl::opt<bool> EnableVPlanNativePath(
238239
"enable-vplan-native-path", cl::init(false), cl::Hidden,
239240
cl::desc("Enable VPlan-native vectorization path with "
240241
"support for outer loop vectorization."));
@@ -419,6 +420,9 @@ class InnerLoopVectorizer {
419420
/// the instruction.
420421
void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
421422

423+
/// Fix the non-induction PHIs in the OrigPHIsToFix vector.
424+
void fixNonInductionPHIs(void);
425+
422426
protected:
423427
friend class LoopVectorizationPlanner;
424428

@@ -686,6 +690,10 @@ class InnerLoopVectorizer {
686690
// Holds the end values for each induction variable. We save the end values
687691
// so we can later fix-up the external users of the induction variables.
688692
DenseMap<PHINode *, Value *> IVEndValues;
693+
694+
// Vector of original scalar PHIs whose corresponding widened PHIs need to be
695+
// fixed up at the end of vector code generation.
696+
SmallVector<PHINode *, 8> OrigPHIsToFix;
689697
};
690698

691699
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -888,6 +896,12 @@ class LoopVectorizationCostModel {
888896
/// vectorization factor \p VF.
889897
bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
890898
assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
899+
900+
// Cost model is not run in the VPlan-native path - return conservative
901+
// result until this changes.
902+
if (EnableVPlanNativePath)
903+
return false;
904+
891905
auto Scalars = InstsToScalarize.find(VF);
892906
assert(Scalars != InstsToScalarize.end() &&
893907
"VF not yet analyzed for scalarization profitability");
@@ -898,6 +912,12 @@ class LoopVectorizationCostModel {
898912
bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
899913
if (VF == 1)
900914
return true;
915+
916+
// Cost model is not run in the VPlan-native path - return conservative
917+
// result until this changes.
918+
if (EnableVPlanNativePath)
919+
return false;
920+
901921
auto UniformsPerVF = Uniforms.find(VF);
902922
assert(UniformsPerVF != Uniforms.end() &&
903923
"VF not yet analyzed for uniformity");
@@ -908,6 +928,12 @@ class LoopVectorizationCostModel {
908928
bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
909929
if (VF == 1)
910930
return true;
931+
932+
// Cost model is not run in the VPlan-native path - return conservative
933+
// result until this changes.
934+
if (EnableVPlanNativePath)
935+
return false;
936+
911937
auto ScalarsPerVF = Scalars.find(VF);
912938
assert(ScalarsPerVF != Scalars.end() &&
913939
"Scalar values are not calculated for VF");
@@ -962,6 +988,12 @@ class LoopVectorizationCostModel {
962988
/// through the cost modeling.
963989
InstWidening getWideningDecision(Instruction *I, unsigned VF) {
964990
assert(VF >= 2 && "Expected VF >=2");
991+
992+
// Cost model is not run in the VPlan-native path - return conservative
993+
// result until this changes.
994+
if (EnableVPlanNativePath)
995+
return CM_GatherScatter;
996+
965997
std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
966998
auto Itr = WideningDecisions.find(InstOnVF);
967999
if (Itr == WideningDecisions.end())
@@ -1397,8 +1429,16 @@ struct LoopVectorize : public FunctionPass {
13971429
AU.addRequired<LoopAccessLegacyAnalysis>();
13981430
AU.addRequired<DemandedBitsWrapperPass>();
13991431
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1400-
AU.addPreserved<LoopInfoWrapperPass>();
1401-
AU.addPreserved<DominatorTreeWrapperPass>();
1432+
1433+
// We currently do not preserve loopinfo/dominator analyses with outer loop
1434+
// vectorization. Until this is addressed, mark these analyses as preserved
1435+
// only for non-VPlan-native path.
1436+
// TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1437+
if (!EnableVPlanNativePath) {
1438+
AU.addPreserved<LoopInfoWrapperPass>();
1439+
AU.addPreserved<DominatorTreeWrapperPass>();
1440+
}
1441+
14021442
AU.addPreserved<BasicAAWrapperPass>();
14031443
AU.addPreserved<GlobalsAAWrapperPass>();
14041444
}
@@ -1749,8 +1789,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
17491789
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
17501790
assert(!V->getType()->isVoidTy() && "Type does not produce a value");
17511791

1752-
// If we have a stride that is replaced by one, do it here.
1753-
if (Legal->hasStride(V))
1792+
// If we have a stride that is replaced by one, do it here. Defer this for
1793+
// the VPlan-native path until we start running Legal checks in that path.
1794+
if (!EnableVPlanNativePath && Legal->hasStride(V))
17541795
V = ConstantInt::get(V->getType(), 1);
17551796

17561797
// If we have a vector mapped to this value, return it.
@@ -2416,6 +2457,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
24162457
}
24172458

24182459
void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2460+
// VPlan-native path does not do any analysis for runtime checks currently.
2461+
if (EnableVPlanNativePath)
2462+
return;
2463+
24192464
BasicBlock *BB = L->getLoopPreheader();
24202465

24212466
// Generate the code that checks in runtime if arrays overlap. We put the
@@ -3060,6 +3105,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
30603105
if (VF > 1)
30613106
truncateToMinimalBitwidths();
30623107

3108+
// Fix widened non-induction PHIs by setting up the PHI operands.
3109+
if (OrigPHIsToFix.size()) {
3110+
assert(EnableVPlanNativePath &&
3111+
"Unexpected non-induction PHIs for fixup in non VPlan-native path");
3112+
fixNonInductionPHIs();
3113+
}
3114+
30633115
// At this point every instruction in the original loop is widened to a
30643116
// vector form. Now we need to fix the recurrences in the loop. These PHI
30653117
// nodes are currently empty because we did not want to introduce cycles.
@@ -3532,12 +3584,62 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
35323584
} while (Changed);
35333585
}
35343586

3587+
void InnerLoopVectorizer::fixNonInductionPHIs() {
3588+
for (PHINode *OrigPhi : OrigPHIsToFix) {
3589+
PHINode *NewPhi =
3590+
cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
3591+
unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
3592+
3593+
SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
3594+
predecessors(OrigPhi->getParent()));
3595+
SmallVector<BasicBlock *, 2> VectorBBPredecessors(
3596+
predecessors(NewPhi->getParent()));
3597+
assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
3598+
"Scalar and Vector BB should have the same number of predecessors");
3599+
3600+
// The insertion point in Builder may be invalidated by the time we get
3601+
// here. Force the Builder insertion point to something valid so that we do
3602+
// not run into issues during insertion point restore in
3603+
// getOrCreateVectorValue calls below.
3604+
Builder.SetInsertPoint(NewPhi);
3605+
3606+
// The predecessor order is preserved and we can rely on mapping between
3607+
// scalar and vector block predecessors.
3608+
for (unsigned i = 0; i < NumIncomingValues; ++i) {
3609+
BasicBlock *NewPredBB = VectorBBPredecessors[i];
3610+
3611+
// When looking up the new scalar/vector values to fix up, use incoming
3612+
// values from original phi.
3613+
Value *ScIncV =
3614+
OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
3615+
3616+
// Scalar incoming value may need a broadcast
3617+
Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
3618+
NewPhi->addIncoming(NewIncV, NewPredBB);
3619+
}
3620+
}
3621+
}
3622+
35353623
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
35363624
unsigned VF) {
3625+
PHINode *P = cast<PHINode>(PN);
3626+
if (EnableVPlanNativePath) {
3627+
// Currently we enter here in the VPlan-native path for non-induction
3628+
// PHIs where all control flow is uniform. We simply widen these PHIs.
3629+
// Create a vector phi with no operands - the vector phi operands will be
3630+
// set at the end of vector code generation.
3631+
Type *VecTy =
3632+
(VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
3633+
Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
3634+
VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
3635+
OrigPHIsToFix.push_back(P);
3636+
3637+
return;
3638+
}
3639+
35373640
assert(PN->getParent() == OrigLoop->getHeader() &&
35383641
"Non-header phis should have been handled elsewhere");
35393642

3540-
PHINode *P = cast<PHINode>(PN);
35413643
// In order to support recurrences we need to be able to vectorize Phi nodes.
35423644
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
35433645
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
@@ -3893,6 +3995,10 @@ void InnerLoopVectorizer::updateAnalysis() {
38933995
// Forget the original basic block.
38943996
PSE.getSE()->forgetLoop(OrigLoop);
38953997

3998+
// DT is not kept up-to-date for outer loop vectorization
3999+
if (EnableVPlanNativePath)
4000+
return;
4001+
38964002
// Update the dominator tree information.
38974003
assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
38984004
"Entry does not dominate exit.");
@@ -6527,6 +6633,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
65276633
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
65286634
HCFGBuilder.buildHierarchicalCFG();
65296635

6636+
SmallPtrSet<Instruction *, 1> DeadInstructions;
6637+
VPlanHCFGTransforms::VPInstructionsToVPRecipes(
6638+
Plan, Legal->getInductionVars(), DeadInstructions);
6639+
6640+
for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
6641+
Plan->addVF(VF);
6642+
65306643
return Plan;
65316644
}
65326645

@@ -6728,11 +6841,26 @@ static bool processLoopInVPlanNativePath(
67286841
Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
67296842

67306843
// Plan how to best vectorize, return the best VF and its cost.
6731-
LVP.planInVPlanNativePath(OptForSize, UserVF);
6844+
VectorizationFactor VF = LVP.planInVPlanNativePath(OptForSize, UserVF);
67326845

6733-
// Returning false. We are currently not generating vector code in the VPlan
6734-
// native path.
6735-
return false;
6846+
// If we are stress testing VPlan builds, do not attempt to generate vector
6847+
// code.
6848+
if (VPlanBuildStressTest)
6849+
return false;
6850+
6851+
LVP.setBestPlan(VF.Width, 1);
6852+
6853+
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1, LVL,
6854+
&CM);
6855+
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
6856+
<< L->getHeader()->getParent()->getName() << "\"\n");
6857+
LVP.executePlan(LB, DT);
6858+
6859+
// Mark the loop as already vectorized to avoid vectorizing again.
6860+
Hints.setAlreadyVectorized();
6861+
6862+
LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
6863+
return true;
67366864
}
67376865

67386866
bool LoopVectorizePass::processLoop(Loop *L) {
@@ -7123,8 +7251,15 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
71237251
if (!Changed)
71247252
return PreservedAnalyses::all();
71257253
PreservedAnalyses PA;
7126-
PA.preserve<LoopAnalysis>();
7127-
PA.preserve<DominatorTreeAnalysis>();
7254+
7255+
// We currently do not preserve loopinfo/dominator analyses with outer loop
7256+
// vectorization. Until this is addressed, mark these analyses as preserved
7257+
// only for non-VPlan-native path.
7258+
// TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7259+
if (!EnableVPlanNativePath) {
7260+
PA.preserve<LoopAnalysis>();
7261+
PA.preserve<DominatorTreeAnalysis>();
7262+
}
71287263
PA.preserve<BasicAA>();
71297264
PA.preserve<GlobalsAA>();
71307265
return PA;

0 commit comments

Comments
 (0)