Skip to content

Commit

Permalink
[VPlan] Compute induction end values in VPlan.
Browse files Browse the repository at this point in the history
Use createDerivedIV to compute IV end values directly in VPlan, instead
of creating them up-front.

This allows updating IV users outside the loop as a follow-up.

Depends on llvm#110004 and
llvm#109975.
  • Loading branch information
fhahn committed Nov 4, 2024
1 parent 572cff5 commit d65cdf1
Show file tree
Hide file tree
Showing 33 changed files with 262 additions and 189 deletions.
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,8 +233,8 @@ class VPBuilder {

VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
FPMathOperator *FPBinOp, VPValue *Start,
VPCanonicalIVPHIRecipe *CanonicalIV,
VPValue *Step, const Twine &Name = "") {
VPValue *CanonicalIV, VPValue *Step,
const Twine &Name = "") {
return tryInsertInstruction(
new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step, Name));
}
Expand Down
215 changes: 152 additions & 63 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ class InnerLoopVectorizer {
/// and the resume values can come from an additional bypass block, the \p
/// AdditionalBypass pair provides information about the bypass block and the
/// end value on the edge from bypass to this loop.
void createInductionResumeValue(
void createInductionBypassValue(
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
Expand Down Expand Up @@ -574,15 +574,11 @@ class InnerLoopVectorizer {
/// vector loop preheader, middle block and scalar preheader.
void createVectorLoopSkeleton(StringRef Prefix);

/// Create new phi nodes for the induction variables to resume iteration count
/// in the scalar epilogue, from where the vectorized loop left off.
/// In cases where the loop skeleton is more complicated (eg. epilogue
/// vectorization) and the resume values can come from an additional bypass
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
/// Create values for the induction variables to resume the iteration count
/// in the bypass block.
void createInductionBypassValues(
const SCEV2ValueTy &ExpandedSCEVs,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
std::pair<BasicBlock *, Value *> AdditionalBypass);

/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
Expand Down Expand Up @@ -2602,30 +2598,19 @@ static void addOperandToPhiInVPIRBasicBlock(VPIRBasicBlock *VPBB, PHINode *P,
}
}

void InnerLoopVectorizer::createInductionResumeValue(
void InnerLoopVectorizer::createInductionBypassValue(
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
ArrayRef<BasicBlock *> BypassBlocks,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
assert(VectorTripCount && "Expected valid arguments");

Instruction *OldInduction = Legal->getPrimaryInduction();
Value *EndValue = nullptr;
Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
if (OrigPhi == OldInduction) {
// We know what the end value is.
EndValue = VectorTripCount;
} else {
if (OrigPhi != OldInduction) {
IRBuilder<> B(LoopVectorPreHeader->getTerminator());

// Fast-math-flags propagate from the original induction instruction.
if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
Step, II.getKind(), II.getInductionBinOp());
EndValue->setName("ind.end");

// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
B.SetInsertPoint(AdditionalBypass.first,
Expand All @@ -2637,26 +2622,6 @@ void InnerLoopVectorizer::createInductionResumeValue(
}
}

VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());

VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
}

VPBuilder ScalarPHBuilder(ScalarPHVPBB);
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi,
{Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
OrigPhi->getDebugLoc(), "bc.resume.val");

auto *ScalarLoopHeader =
cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
EndValueFromAdditionalBypass};
}
Expand All @@ -2675,23 +2640,16 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
return I->second;
}

void InnerLoopVectorizer::createInductionResumeValues(
void InnerLoopVectorizer::createInductionBypassValues(
const SCEV2ValueTy &ExpandedSCEVs,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
// The starting values of PHI nodes depend on the counter of the last
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.
assert(AdditionalBypass.first && AdditionalBypass.second &&
"Must have bypass information");

for (const auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
const InductionDescriptor &II = InductionEntry.second;
createInductionResumeValue(OrigPhi, II, getExpandedStep(II, ExpandedSCEVs),
createInductionBypassValue(OrigPhi, II, getExpandedStep(II, ExpandedSCEVs),
LoopBypassBlocks, AdditionalBypass);
}
}
Expand Down Expand Up @@ -2754,8 +2712,8 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
// faster.
emitMemRuntimeChecks(LoopScalarPreHeader);

// Emit phis for the new starting index of the scalar loop.
createInductionResumeValues(ExpandedSCEVs);
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
assert(VectorTripCount && "Expected valid arguments");

return {LoopVectorPreHeader, nullptr};
}
Expand Down Expand Up @@ -7719,6 +7677,18 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.getOrCreateVectorTripCount(nullptr),
CanonicalIVStartValue, State);

VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());

VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar
// preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
}

BestVPlan.execute(&State);

// 2.5 Collect reduction resume values.
Expand Down Expand Up @@ -7836,7 +7806,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
} else
continue;

createInductionResumeValue(IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
createInductionBypassValue(IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
LoopBypassBlocks);
}

Expand Down Expand Up @@ -8006,20 +7976,22 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// Generate a resume induction for the vector epilogue and put it in the
// vector epilogue preheader
Type *IdxTy = Legal->getWidestInductionType();

PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
EPI.MainLoopIterationCountCheck);

// Generate induction resume values. These variables save the new starting
// indexes for the scalar loop. They are used to test if there are any tail
// iterations left once the vector loop has completed.
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
assert(VectorTripCount && "Expected valid arguments");

// Generate induction resume values for the bypass blocks.
// Note that when the vectorized epilogue is skipped due to iteration count
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
createInductionResumeValues(ExpandedSCEVs,
createInductionBypassValues(ExpandedSCEVs,
{VecEpilogueIterationCountCheck,
EPI.VectorTripCount} /* AdditionalBypass */);

Expand Down Expand Up @@ -8932,6 +8904,74 @@ addUsersInExitBlock(VPlan &Plan,
}
}

/// For every wide integer/FP or pointer induction phi in the header of the
/// vector loop region of \p Plan, create a ResumePhi recipe in the scalar
/// preheader with the induction's end value and start value as operands, and
/// pass it to addOperandToPhiInVPIRBasicBlock for the original phi in the
/// scalar preheader's single successor.
///
/// The end value is the vector trip count for canonical inductions. Otherwise
/// it is a derived IV built from the start value, the vector trip count and
/// the step, inserted via a builder on the loop region's single predecessor.
/// If the inferred scalar type of the end value differs from the induction's
/// scalar type, a Trunc cast to that type is added. Integer/FP inductions that
/// have a truncate instruction are skipped.
static void addResumeValuesForInductions(VPlan &Plan) {
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  auto *LoopRegion = Plan.getVectorLoopRegion();
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();

  // End values are emitted into the single predecessor of the loop region.
  VPBuilder EndValueBuilder(
      cast<VPBasicBlock>(LoopRegion->getSinglePredecessor()));

  // Locate the scalar preheader by going through the middle block, i.e. the
  // single successor of the loop region.
  auto FindScalarPreheader = [LoopRegion]() -> VPBasicBlock * {
    auto *Middle = cast<VPBasicBlock>(LoopRegion->getSingleSuccessor());
    // Order is strict: with two successors, the first is the exit block and
    // the second is the scalar preheader.
    if (Middle->getNumSuccessors() == 2)
      return cast<VPBasicBlock>(Middle->getSuccessors()[1]);
    return cast<VPBasicBlock>(Middle->getSingleSuccessor());
  };

  for (VPRecipeBase &PhiR : HeaderVPBB->phis()) {
    PHINode *ScalarPhi = nullptr;
    const InductionDescriptor *IndDesc = nullptr;
    VPValue *StartV = nullptr;
    VPValue *StepV = nullptr;
    Type *IVTy = nullptr;
    bool UsesTripCountDirectly = false;

    if (auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&PhiR)) {
      // Inductions with a truncate instruction get no resume value here.
      if (IntOrFpIV->getTruncInst())
        continue;
      ScalarPhi = cast<PHINode>(IntOrFpIV->getUnderlyingValue());
      IndDesc = &IntOrFpIV->getInductionDescriptor();
      StartV = IntOrFpIV->getStartValue();
      StepV = IntOrFpIV->getStepValue();
      IVTy = IntOrFpIV->getScalarType();
      UsesTripCountDirectly = IntOrFpIV->isCanonical();
    } else if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&PhiR)) {
      ScalarPhi = cast<PHINode>(PtrIV->getUnderlyingValue());
      IndDesc = &PtrIV->getInductionDescriptor();
      StartV = PtrIV->getStartValue();
      StepV = PtrIV->getOperand(1);
      // The scalar type is taken from the live-in IR start value.
      IVTy = StartV->getLiveInIRValue()->getType();
    } else {
      // Any other header phi is left untouched.
      continue;
    }

    // Canonical inductions resume from the vector trip count itself; all
    // others need it transformed by their start and step.
    VPValue *VectorTC = &Plan.getVectorTripCount();
    VPValue *ResumeEnd = VectorTC;
    if (!UsesTripCountDirectly)
      ResumeEnd = EndValueBuilder.createDerivedIV(
          IndDesc->getKind(),
          dyn_cast_or_null<FPMathOperator>(IndDesc->getInductionBinOp()),
          StartV, VectorTC, StepV);

    // Truncate the end value when its inferred type does not match the
    // induction's scalar type.
    if (TypeInfo.inferScalarType(ResumeEnd) != IVTy)
      ResumeEnd = EndValueBuilder.createScalarCast(Instruction::Trunc,
                                                   ResumeEnd, IVTy);

    VPBasicBlock *ScalarPH = FindScalarPreheader();
    VPBuilder ResumeBuilder(ScalarPH);
    auto *ResumePhi = ResumeBuilder.createNaryOp(
        VPInstruction::ResumePhi, {ResumeEnd, StartV},
        ScalarPhi->getDebugLoc(), "bc.resume.val");

    // Feed the resume value into the original phi in the block that follows
    // the scalar preheader.
    addOperandToPhiInVPIRBasicBlock(
        cast<VPIRBasicBlock>(ScalarPH->getSingleSuccessor()), ScalarPhi,
        ResumePhi);
  }
}

/// Handle users in the exit block for first order reductions in the original
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
/// users in the original exit block using the VPIRInstruction wrapping to the
Expand Down Expand Up @@ -9205,6 +9245,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlock(*Plan, ExitUsersToFix);
addResumeValuesForInductions(*Plan);

// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
// bring the VPlan to its final state.
Expand Down Expand Up @@ -9315,6 +9357,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
addResumeValuesForInductions(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
Expand Down Expand Up @@ -9599,7 +9642,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName(Name);
assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
/* assert((isa<Constant>(CanonicalIV) || DerivedIV != CanonicalIV) &&*/
/*"IV didn't need transforming?");*/

State.set(this, DerivedIV, VPLane(0));
}
Expand Down Expand Up @@ -10268,6 +10312,52 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EPI, &LVL, &CM, BFI, PSI, Checks,
*BestMainPlan);

VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
// Collect PHI nodes of wide inductions in the VPlan for the epilogue.
// Those will need their resume-values computed from the main vector
// loop. Others can be removed in the main VPlan.
SmallPtrSet<PHINode *, 2> WidenedPhis;
for (VPRecipeBase &R :
BestEpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
if (!isa<VPWidenIntOrFpInductionRecipe,
VPWidenPointerInductionRecipe>(&R))
continue;
if (isa<VPWidenIntOrFpInductionRecipe>(&R))
WidenedPhis.insert(
cast<VPWidenIntOrFpInductionRecipe>(&R)->getPHINode());
else
WidenedPhis.insert(
cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
}
VPBasicBlock *MiddleVPBB = cast<VPBasicBlock>(
BestMainPlan->getVectorLoopRegion()->getSingleSuccessor());

VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar
// preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
}

for (VPRecipeBase &R :
*cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor())) {
auto *VPIRInst = cast<VPIRInstruction>(&R);
auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
if (!IRI)
break;
if (WidenedPhis.contains(IRI) ||
!LVL.getInductionVars().contains(IRI))
continue;
VPRecipeBase *ResumePhi =
VPIRInst->getOperand(0)->getDefiningRecipe();
VPIRInst->setOperand(0, BestMainPlan->getOrAddLiveIn(
Constant::getNullValue(IRI->getType())));
ResumePhi->eraseFromParent();
}
VPlanTransforms::removeDeadRecipes(*BestMainPlan);

auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, false);
++LoopsVectorized;
Expand All @@ -10276,7 +10366,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
Checks, BestEpiPlan);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
case VPInstruction::ResumePhi:
return false;
default:
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
Expand Down Expand Up @@ -102,9 +102,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
Expand Down
Loading

0 comments on commit d65cdf1

Please sign in to comment.