[VPlan] Compute induction end values in VPlan.
Use createDerivedIV to compute IV end values directly in VPlan, instead
of creating them up-front.

This allows updating IV users outside the loop as a follow-up.

Depends on llvm#110004 and
llvm#109975.
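
In short: rather than emitting the `ind.end` computation as IR in the preheader up front, the end value is now modelled in the VPlan itself and wired into the resume phi of the scalar preheader. The following is a minimal sketch condensing the new addResumeValuesForInductions logic from the diff below; the wrapper name and parameter list are invented for illustration, while the builder calls and operand order are taken from this diff and assume LLVM's VPlan infrastructure (VPlan.h, LoopVectorizationPlanner.h).

// Sketch only, not the upstream code: the helper name and signature are
// hypothetical; createDerivedIV and createNaryOp are used as in the diff.
static VPValue *createResumeValueForInduction(
    VPlan &Plan, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder,
    const InductionDescriptor &ID, VPValue *Start, VPValue *Step,
    bool IsCanonical, PHINode *OrigPhi) {
  // The canonical IV starts at 0 and steps by 1, so it ends exactly at the
  // vector trip count.
  VPValue *EndValue = &Plan.getVectorTripCount();
  if (!IsCanonical) {
    // Any other induction derives its end value from the vector trip count:
    // Start advanced by VectorTripCount steps, per the induction kind.
    EndValue = VectorPHBuilder.createDerivedIV(
        ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
        Start, &Plan.getVectorTripCount(), Step);
  }
  // Feed the end value into a ResumePhi in the scalar preheader: the scalar
  // loop resumes from EndValue after the vector loop, and from the original
  // Start when the vector loop is bypassed.
  return ScalarPHBuilder.createNaryOp(VPInstruction::ResumePhi,
                                      {EndValue, Start}, OrigPhi->getDebugLoc(),
                                      "bc.resume.val");
}

In the actual change this logic lives in addResumeValuesForInductions (see LoopVectorize.cpp below), which additionally truncates the end value to the phi's scalar type when the types differ.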
fhahn committed Oct 21, 2024
1 parent 5c7833b commit 3b414ba
Showing 33 changed files with 347 additions and 247 deletions.
3 changes: 1 addition & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -233,8 +233,7 @@ class VPBuilder {

VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
FPMathOperator *FPBinOp, VPValue *Start,
VPCanonicalIVPHIRecipe *CanonicalIV,
VPValue *Step) {
VPValue *CanonicalIV, VPValue *Step) {
return tryInsertInstruction(
new VPDerivedIVRecipe(Kind, FPBinOp, Start, CanonicalIV, Step));
}
160 changes: 130 additions & 30 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2610,22 +2610,15 @@ void InnerLoopVectorizer::createInductionResumeValue(
assert(VectorTripCount && "Expected valid arguments");

Instruction *OldInduction = Legal->getPrimaryInduction();
Value *EndValue = nullptr;
Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
if (OrigPhi == OldInduction) {
// We know what the end value is.
EndValue = VectorTripCount;
} else {
IRBuilder<> B(LoopVectorPreHeader->getTerminator());

// Fast-math-flags propagate from the original induction instruction.
if (isa_and_nonnull<FPMathOperator>(II.getInductionBinOp()))
B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

EndValue = emitTransformedIndex(B, VectorTripCount, II.getStartValue(),
Step, II.getKind(), II.getInductionBinOp());
EndValue->setName("ind.end");

// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
B.SetInsertPoint(AdditionalBypass.first,
@@ -2637,26 +2630,6 @@ void InnerLoopVectorizer::createInductionResumeValue(
}
}

VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());

VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
}

VPBuilder ScalarPHBuilder(ScalarPHVPBB);
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi,
{Plan.getOrAddLiveIn(EndValue), Plan.getOrAddLiveIn(II.getStartValue())},
OrigPhi->getDebugLoc(), "bc.resume.val");

auto *ScalarLoopHeader =
cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
InductionBypassValues[OrigPhi] = {AdditionalBypass.first,
EndValueFromAdditionalBypass};
}
@@ -7704,10 +7677,22 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ILV.getOrCreateVectorTripCount(nullptr),
CanonicalIVStartValue, State);

VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());

VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar
// preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
}

BestVPlan.execute(&State);

// 2.5 Collect reduction resume values.
auto *ExitVPBB =
VPBasicBlock *ExitVPBB =
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
for (VPRecipeBase &R : *ExitVPBB) {
createAndCollectMergePhiForReduction(
@@ -7992,6 +7977,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// Generate a resume induction for the vector epilogue and put it in the
// vector epilogue preheader
Type *IdxTy = Legal->getWidestInductionType();

PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val");
EPResumeVal->insertBefore(LoopVectorPreHeader->getFirstNonPHIIt());
EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
@@ -8879,6 +8865,74 @@ addUsersInExitBlock(VPlan &Plan,
}
}

static void addResumeValuesForInductions(VPlan &Plan) {
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();

VPBuilder Builder(
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor()));
for (VPRecipeBase &R : Header->phis()) {
PHINode *OrigPhi;
const InductionDescriptor *ID;
VPValue *Start;
VPValue *Step;
Type *ScalarTy;
bool IsCanonical = false;
if (auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
if (WideIV->getTruncInst())
continue;
OrigPhi = cast<PHINode>(WideIV->getUnderlyingValue());
ID = &WideIV->getInductionDescriptor();
Start = WideIV->getStartValue();
Step = WideIV->getStepValue();
ScalarTy = WideIV->getScalarType();
IsCanonical = WideIV->isCanonical();
} else if (auto *WideIV = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
OrigPhi = cast<PHINode>(WideIV->getUnderlyingValue());
ID = &WideIV->getInductionDescriptor();
Start = WideIV->getStartValue();
Step = WideIV->getOperand(1);
ScalarTy = Start->getLiveInIRValue()->getType();
} else {
continue;
}

VPValue *EndValue = &Plan.getVectorTripCount();
if (!IsCanonical) {
EndValue = Builder.createDerivedIV(
ID->getKind(),
dyn_cast_or_null<FPMathOperator>(ID->getInductionBinOp()), Start,
&Plan.getVectorTripCount(), Step);
}

if (ScalarTy != TypeInfo.inferScalarType(EndValue)) {
EndValue =
Builder.createScalarCast(Instruction::Trunc, EndValue, ScalarTy);
}

VPBasicBlock *MiddleVPBB =
cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());

VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar
// preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
}

VPBuilder ScalarPHBuilder(ScalarPHVPBB);
auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
VPInstruction::ResumePhi, {EndValue, Start}, OrigPhi->getDebugLoc(),
"bc.resume.val");

auto *ScalarLoopHeader =
cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor());
addOperandToPhiInVPIRBasicBlock(ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
}
}

/// Handle live-outs for first order reductions, both in the scalar preheader
/// and the original exit block:
/// 1. Feed a resume value for every FOR from the vector loop to the scalar
@@ -9174,6 +9228,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
addLiveOutsForFirstOrderRecurrences(*Plan, ExitUsersToFix);
addUsersInExitBlock(*Plan, ExitUsersToFix);
addResumeValuesForInductions(*Plan);

// ---------------------------------------------------------------------------
// Transform initial VPlan: Apply previously taken decisions, in order, to
@@ -9279,6 +9334,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
bool HasNUW = true;
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
DebugLoc());
addResumeValuesForInductions(*Plan);
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
@@ -9562,7 +9618,8 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) {
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step,
Kind, cast_if_present<BinaryOperator>(FPBinOp));
DerivedIV->setName("offset.idx");
assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
assert((isa<Constant>(CanonicalIV) || DerivedIV != CanonicalIV) &&
"IV didn't need transforming?");

State.set(this, DerivedIV, VPLane(0));
}
@@ -10231,6 +10288,50 @@ bool LoopVectorizePass::processLoop(Loop *L) {
EPI, &LVL, &CM, BFI, PSI, Checks,
*BestMainPlan);

VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
// Collect PHI nodes of wide inductions in the VPlan for the epilogue. Those
// will need their resume-values computed from the main vector loop. Others
// can be removed in the main VPlan.
SmallPtrSet<PHINode *, 2> WidenedPhis;
for (VPRecipeBase &R :
BestEpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
if (!isa<VPWidenIntOrFpInductionRecipe,
VPWidenPointerInductionRecipe>(&R))
continue;
if (isa<VPWidenIntOrFpInductionRecipe>(&R))
WidenedPhis.insert(
cast<VPWidenIntOrFpInductionRecipe>(&R)->getPHINode());
else
WidenedPhis.insert(
cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
}
VPBasicBlock *MiddleVPBB = cast<VPBasicBlock>(
BestMainPlan->getVectorLoopRegion()->getSingleSuccessor());

VPBasicBlock *ScalarPHVPBB = nullptr;
if (MiddleVPBB->getNumSuccessors() == 2) {
// Order is strict: first is the exit block, second is the scalar
// preheader.
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors()[1]);
} else {
ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor());
}

for (VPRecipeBase &R :
*cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor())) {
auto *VPIRInst = cast<VPIRInstruction>(&R);
auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
if (!IRI)
break;
if (WidenedPhis.contains(IRI) ||
!LVL.getInductionVars().contains(IRI))
continue;
VPRecipeBase *ResumePhi =
VPIRInst->getOperand(0)->getDefiningRecipe();
VPIRInst->setOperand(0, BestMainPlan->getOrAddLiveIn(
Constant::getNullValue(IRI->getType())));
ResumePhi->eraseFromParent();
}
VPlanTransforms::removeDeadRecipes(*BestMainPlan);

auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
*BestMainPlan, MainILV, DT, true);
++LoopsVectorized;
@@ -10239,7 +10340,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// edges from the first pass.
EPI.MainLoopVF = EPI.EpilogueVF;
EPI.MainLoopUF = EPI.EpilogueUF;
VPlan &BestEpiPlan = LVP.getPlanFor(EPI.EpilogueVF);
EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
ORE, EPI, &LVL, &CM, BFI, PSI,
Checks, BestEpiPlan);
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -63,6 +63,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
case VPInstruction::ResumePhi:
return false;
default:
return true;
@@ -13,9 +13,9 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
@@ -102,9 +102,9 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
@@ -785,11 +785,11 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[N_VEC]], 8
; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]]
26 changes: 13 additions & 13 deletions llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll
@@ -522,31 +522,31 @@ define void @trunc_ivs_and_store(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: pred.store.continue:
; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
; PRED-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
; PRED: pred.store.if2:
; PRED: pred.store.if3:
; PRED-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP18]], i32 1
; PRED-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP24]]
; PRED-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 1
; PRED-NEXT: store i32 [[TMP26]], ptr [[TMP25]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE4]]
; PRED: pred.store.continue3:
; PRED: pred.store.continue4:
; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
; PRED-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
; PRED: pred.store.if4:
; PRED: pred.store.if5:
; PRED-NEXT: [[TMP28:%.*]] = extractelement <4 x i64> [[TMP18]], i32 2
; PRED-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP28]]
; PRED-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], 2
; PRED-NEXT: store i32 [[TMP30]], ptr [[TMP29]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE6]]
; PRED: pred.store.continue5:
; PRED: pred.store.continue6:
; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
; PRED-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]]
; PRED: pred.store.if6:
; PRED: pred.store.if7:
; PRED-NEXT: [[TMP32:%.*]] = extractelement <4 x i64> [[TMP18]], i32 3
; PRED-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP32]]
; PRED-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], 3
; PRED-NEXT: store i32 [[TMP34]], ptr [[TMP33]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE8]]
; PRED: pred.store.continue7:
; PRED: pred.store.continue8:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP16]])
; PRED-NEXT: [[TMP35:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
@@ -719,31 +719,31 @@ define void @ivs_trunc_and_ext(i32 %x, ptr %dst, i64 %N) #0 {
; PRED: pred.store.continue:
; PRED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 1
; PRED-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3:%.*]]
; PRED: pred.store.if1:
; PRED: pred.store.if2:
; PRED-NEXT: [[TMP23:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
; PRED-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP23]]
; PRED-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 1
; PRED-NEXT: store i32 [[TMP25]], ptr [[TMP24]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE3]]
; PRED: pred.store.continue2:
; PRED: pred.store.continue3:
; PRED-NEXT: [[TMP26:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 2
; PRED-NEXT: br i1 [[TMP26]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
; PRED: pred.store.if3:
; PRED: pred.store.if4:
; PRED-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
; PRED-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP27]]
; PRED-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], 2
; PRED-NEXT: store i32 [[TMP29]], ptr [[TMP28]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE5]]
; PRED: pred.store.continue4:
; PRED: pred.store.continue5:
; PRED-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK]], i32 3
; PRED-NEXT: br i1 [[TMP30]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]]
; PRED: pred.store.if5:
; PRED: pred.store.if6:
; PRED-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
; PRED-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP31]]
; PRED-NEXT: [[TMP33:%.*]] = add i32 [[OFFSET_IDX]], 3
; PRED-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4
; PRED-NEXT: br label [[PRED_STORE_CONTINUE7]]
; PRED: pred.store.continue6:
; PRED: pred.store.continue7:
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP15]])
; PRED-NEXT: [[TMP34:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], <i1 true, i1 true, i1 true, i1 true>
@@ -863,8 +863,8 @@ define void @exit_cond_zext_iv(ptr %dst, i64 %N) {
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX1]], 1
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
; PRED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
; PRED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX1]], 1
; PRED-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i32
; PRED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
; PRED-NEXT: br label [[LOOP:%.*]]