58
58
#include " LoopVectorizationPlanner.h"
59
59
#include " VPRecipeBuilder.h"
60
60
#include " VPlanHCFGBuilder.h"
61
+ #include " VPlanHCFGTransforms.h"
61
62
#include " llvm/ADT/APInt.h"
62
63
#include " llvm/ADT/ArrayRef.h"
63
64
#include " llvm/ADT/DenseMap.h"
@@ -234,7 +235,7 @@ static cl::opt<unsigned> MaxNestedScalarReductionIC(
234
235
cl::desc(" The maximum interleave count to use when interleaving a scalar "
235
236
" reduction in a nested loop." ));
236
237
237
- static cl::opt<bool > EnableVPlanNativePath (
238
+ cl::opt<bool > EnableVPlanNativePath (
238
239
" enable-vplan-native-path" , cl::init(false ), cl::Hidden,
239
240
cl::desc(" Enable VPlan-native vectorization path with "
240
241
" support for outer loop vectorization." ));
@@ -419,6 +420,9 @@ class InnerLoopVectorizer {
419
420
// / the instruction.
420
421
void setDebugLocFromInst (IRBuilder<> &B, const Value *Ptr);
421
422
423
/// Fix the non-induction PHIs in the OrigPHIsToFix vector: for each recorded
/// original PHI, add the incoming values to its corresponding widened PHI.
void fixNonInductionPHIs();
425
+
422
426
protected:
423
427
friend class LoopVectorizationPlanner ;
424
428
@@ -686,6 +690,10 @@ class InnerLoopVectorizer {
686
690
// Holds the end values for each induction variable. We save the end values
687
691
// so we can later fix-up the external users of the induction variables.
688
692
DenseMap<PHINode *, Value *> IVEndValues;
693
+
694
+ // Vector of original scalar PHIs whose corresponding widened PHIs need to be
695
+ // fixed up at the end of vector code generation.
696
+ SmallVector<PHINode *, 8 > OrigPHIsToFix;
689
697
};
690
698
691
699
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -888,6 +896,12 @@ class LoopVectorizationCostModel {
888
896
// / vectorization factor \p VF.
889
897
bool isProfitableToScalarize (Instruction *I, unsigned VF) const {
890
898
assert (VF > 1 && " Profitable to scalarize relevant only for VF > 1." );
899
+
900
+ // Cost model is not run in the VPlan-native path - return conservative
901
+ // result until this changes.
902
+ if (EnableVPlanNativePath)
903
+ return false ;
904
+
891
905
auto Scalars = InstsToScalarize.find (VF);
892
906
assert (Scalars != InstsToScalarize.end () &&
893
907
" VF not yet analyzed for scalarization profitability" );
@@ -898,6 +912,12 @@ class LoopVectorizationCostModel {
898
912
bool isUniformAfterVectorization (Instruction *I, unsigned VF) const {
899
913
if (VF == 1 )
900
914
return true ;
915
+
916
+ // Cost model is not run in the VPlan-native path - return conservative
917
+ // result until this changes.
918
+ if (EnableVPlanNativePath)
919
+ return false ;
920
+
901
921
auto UniformsPerVF = Uniforms.find (VF);
902
922
assert (UniformsPerVF != Uniforms.end () &&
903
923
" VF not yet analyzed for uniformity" );
@@ -908,6 +928,12 @@ class LoopVectorizationCostModel {
908
928
bool isScalarAfterVectorization (Instruction *I, unsigned VF) const {
909
929
if (VF == 1 )
910
930
return true ;
931
+
932
+ // Cost model is not run in the VPlan-native path - return conservative
933
+ // result until this changes.
934
+ if (EnableVPlanNativePath)
935
+ return false ;
936
+
911
937
auto ScalarsPerVF = Scalars.find (VF);
912
938
assert (ScalarsPerVF != Scalars.end () &&
913
939
" Scalar values are not calculated for VF" );
@@ -962,6 +988,12 @@ class LoopVectorizationCostModel {
962
988
// / through the cost modeling.
963
989
InstWidening getWideningDecision (Instruction *I, unsigned VF) {
964
990
assert (VF >= 2 && " Expected VF >=2" );
991
+
992
+ // Cost model is not run in the VPlan-native path - return conservative
993
+ // result until this changes.
994
+ if (EnableVPlanNativePath)
995
+ return CM_GatherScatter;
996
+
965
997
std::pair<Instruction *, unsigned > InstOnVF = std::make_pair (I, VF);
966
998
auto Itr = WideningDecisions.find (InstOnVF);
967
999
if (Itr == WideningDecisions.end ())
@@ -1397,8 +1429,16 @@ struct LoopVectorize : public FunctionPass {
1397
1429
AU.addRequired <LoopAccessLegacyAnalysis>();
1398
1430
AU.addRequired <DemandedBitsWrapperPass>();
1399
1431
AU.addRequired <OptimizationRemarkEmitterWrapperPass>();
1400
- AU.addPreserved <LoopInfoWrapperPass>();
1401
- AU.addPreserved <DominatorTreeWrapperPass>();
1432
+
1433
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
1434
+ // vectorization. Until this is addressed, mark these analyses as preserved
1435
+ // only for non-VPlan-native path.
1436
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1437
+ if (!EnableVPlanNativePath) {
1438
+ AU.addPreserved <LoopInfoWrapperPass>();
1439
+ AU.addPreserved <DominatorTreeWrapperPass>();
1440
+ }
1441
+
1402
1442
AU.addPreserved <BasicAAWrapperPass>();
1403
1443
AU.addPreserved <GlobalsAAWrapperPass>();
1404
1444
}
@@ -1749,8 +1789,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
1749
1789
assert (!V->getType ()->isVectorTy () && " Can't widen a vector" );
1750
1790
assert (!V->getType ()->isVoidTy () && " Type does not produce a value" );
1751
1791
1752
- // If we have a stride that is replaced by one, do it here.
1753
- if (Legal->hasStride (V))
1792
+ // If we have a stride that is replaced by one, do it here. Defer this for
1793
+ // the VPlan-native path until we start running Legal checks in that path.
1794
+ if (!EnableVPlanNativePath && Legal->hasStride (V))
1754
1795
V = ConstantInt::get (V->getType (), 1 );
1755
1796
1756
1797
// If we have a vector mapped to this value, return it.
@@ -2416,6 +2457,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2416
2457
}
2417
2458
2418
2459
void InnerLoopVectorizer::emitMemRuntimeChecks (Loop *L, BasicBlock *Bypass) {
2460
+ // VPlan-native path does not do any analysis for runtime checks currently.
2461
+ if (EnableVPlanNativePath)
2462
+ return ;
2463
+
2419
2464
BasicBlock *BB = L->getLoopPreheader ();
2420
2465
2421
2466
// Generate the code that checks in runtime if arrays overlap. We put the
@@ -3060,6 +3105,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
3060
3105
if (VF > 1 )
3061
3106
truncateToMinimalBitwidths ();
3062
3107
3108
+ // Fix widened non-induction PHIs by setting up the PHI operands.
3109
+ if (OrigPHIsToFix.size ()) {
3110
+ assert (EnableVPlanNativePath &&
3111
+ " Unexpected non-induction PHIs for fixup in non VPlan-native path" );
3112
+ fixNonInductionPHIs ();
3113
+ }
3114
+
3063
3115
// At this point every instruction in the original loop is widened to a
3064
3116
// vector form. Now we need to fix the recurrences in the loop. These PHI
3065
3117
// nodes are currently empty because we did not want to introduce cycles.
@@ -3532,12 +3584,62 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3532
3584
} while (Changed);
3533
3585
}
3534
3586
3587
+ void InnerLoopVectorizer::fixNonInductionPHIs () {
3588
+ for (PHINode *OrigPhi : OrigPHIsToFix) {
3589
+ PHINode *NewPhi =
3590
+ cast<PHINode>(VectorLoopValueMap.getVectorValue (OrigPhi, 0 ));
3591
+ unsigned NumIncomingValues = OrigPhi->getNumIncomingValues ();
3592
+
3593
+ SmallVector<BasicBlock *, 2 > ScalarBBPredecessors (
3594
+ predecessors (OrigPhi->getParent ()));
3595
+ SmallVector<BasicBlock *, 2 > VectorBBPredecessors (
3596
+ predecessors (NewPhi->getParent ()));
3597
+ assert (ScalarBBPredecessors.size () == VectorBBPredecessors.size () &&
3598
+ " Scalar and Vector BB should have the same number of predecessors" );
3599
+
3600
+ // The insertion point in Builder may be invalidated by the time we get
3601
+ // here. Force the Builder insertion point to something valid so that we do
3602
+ // not run into issues during insertion point restore in
3603
+ // getOrCreateVectorValue calls below.
3604
+ Builder.SetInsertPoint (NewPhi);
3605
+
3606
+ // The predecessor order is preserved and we can rely on mapping between
3607
+ // scalar and vector block predecessors.
3608
+ for (unsigned i = 0 ; i < NumIncomingValues; ++i) {
3609
+ BasicBlock *NewPredBB = VectorBBPredecessors[i];
3610
+
3611
+ // When looking up the new scalar/vector values to fix up, use incoming
3612
+ // values from original phi.
3613
+ Value *ScIncV =
3614
+ OrigPhi->getIncomingValueForBlock (ScalarBBPredecessors[i]);
3615
+
3616
+ // Scalar incoming value may need a broadcast
3617
+ Value *NewIncV = getOrCreateVectorValue (ScIncV, 0 );
3618
+ NewPhi->addIncoming (NewIncV, NewPredBB);
3619
+ }
3620
+ }
3621
+ }
3622
+
3535
3623
void InnerLoopVectorizer::widenPHIInstruction (Instruction *PN, unsigned UF,
3536
3624
unsigned VF) {
3625
+ PHINode *P = cast<PHINode>(PN);
3626
+ if (EnableVPlanNativePath) {
3627
+ // Currently we enter here in the VPlan-native path for non-induction
3628
+ // PHIs where all control flow is uniform. We simply widen these PHIs.
3629
+ // Create a vector phi with no operands - the vector phi operands will be
3630
+ // set at the end of vector code generation.
3631
+ Type *VecTy =
3632
+ (VF == 1 ) ? PN->getType () : VectorType::get (PN->getType (), VF);
3633
+ Value *VecPhi = Builder.CreatePHI (VecTy, PN->getNumOperands (), " vec.phi" );
3634
+ VectorLoopValueMap.setVectorValue (P, 0 , VecPhi);
3635
+ OrigPHIsToFix.push_back (P);
3636
+
3637
+ return ;
3638
+ }
3639
+
3537
3640
assert (PN->getParent () == OrigLoop->getHeader () &&
3538
3641
" Non-header phis should have been handled elsewhere" );
3539
3642
3540
- PHINode *P = cast<PHINode>(PN);
3541
3643
// In order to support recurrences we need to be able to vectorize Phi nodes.
3542
3644
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
3543
3645
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
@@ -3893,6 +3995,10 @@ void InnerLoopVectorizer::updateAnalysis() {
3893
3995
// Forget the original basic block.
3894
3996
PSE.getSE ()->forgetLoop (OrigLoop);
3895
3997
3998
+ // DT is not kept up-to-date for outer loop vectorization.
3999
+ if (EnableVPlanNativePath)
4000
+ return ;
4001
+
3896
4002
// Update the dominator tree information.
3897
4003
assert (DT->properlyDominates (LoopBypassBlocks.front (), LoopExitBlock) &&
3898
4004
" Entry does not dominate exit." );
@@ -6527,6 +6633,13 @@ LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
6527
6633
VPlanHCFGBuilder HCFGBuilder (OrigLoop, LI, *Plan);
6528
6634
HCFGBuilder.buildHierarchicalCFG ();
6529
6635
6636
+ SmallPtrSet<Instruction *, 1 > DeadInstructions;
6637
+ VPlanHCFGTransforms::VPInstructionsToVPRecipes (
6638
+ Plan, Legal->getInductionVars (), DeadInstructions);
6639
+
6640
+ for (unsigned VF = Range.Start ; VF < Range.End ; VF *= 2 )
6641
+ Plan->addVF (VF);
6642
+
6530
6643
return Plan;
6531
6644
}
6532
6645
@@ -6728,11 +6841,26 @@ static bool processLoopInVPlanNativePath(
6728
6841
Hints.getForce () != LoopVectorizeHints::FK_Enabled && F->optForSize ();
6729
6842
6730
6843
// Plan how to best vectorize, return the best VF and its cost.
6731
- LVP.planInVPlanNativePath (OptForSize, UserVF);
6844
+ VectorizationFactor VF = LVP.planInVPlanNativePath (OptForSize, UserVF);
6732
6845
6733
- // Returning false. We are currently not generating vector code in the VPlan
6734
- // native path.
6735
- return false ;
6846
+ // If we are stress testing VPlan builds, do not attempt to generate vector
6847
+ // code.
6848
+ if (VPlanBuildStressTest)
6849
+ return false ;
6850
+
6851
+ LVP.setBestPlan (VF.Width , 1 );
6852
+
6853
+ InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, UserVF, 1 , LVL,
6854
+ &CM);
6855
+ LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
6856
+ << L->getHeader ()->getParent ()->getName () << " \"\n " );
6857
+ LVP.executePlan (LB, DT);
6858
+
6859
+ // Mark the loop as already vectorized to avoid vectorizing again.
6860
+ Hints.setAlreadyVectorized ();
6861
+
6862
+ LLVM_DEBUG (verifyFunction (*L->getHeader ()->getParent ()));
6863
+ return true ;
6736
6864
}
6737
6865
6738
6866
bool LoopVectorizePass::processLoop (Loop *L) {
@@ -7123,8 +7251,15 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
7123
7251
if (!Changed)
7124
7252
return PreservedAnalyses::all ();
7125
7253
PreservedAnalyses PA;
7126
- PA.preserve <LoopAnalysis>();
7127
- PA.preserve <DominatorTreeAnalysis>();
7254
+
7255
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
7256
+ // vectorization. Until this is addressed, mark these analyses as preserved
7257
+ // only for non-VPlan-native path.
7258
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7259
+ if (!EnableVPlanNativePath) {
7260
+ PA.preserve <LoopAnalysis>();
7261
+ PA.preserve <DominatorTreeAnalysis>();
7262
+ }
7128
7263
PA.preserve <BasicAA>();
7129
7264
PA.preserve <GlobalsAA>();
7130
7265
return PA;
0 commit comments