[VPlan] Explicitly handle scalar pointer inductions. #80273
Conversation
@llvm/pr-subscribers-llvm-transforms
Author: Florian Hahn (fhahn)
Changes
Add a new PtrAdd opcode to VPInstruction that corresponds to IRBuilder::CreatePtrAdd, which creates a GEP with source element type i8.
This is then used to model scalarizing VPWidenPointerInductionRecipe by introducing scalar-steps to model the index increment followed by a PtrAdd.
Note that PtrAdd needs to be able to generate code for only the first lane or for all lanes. This may warrant introducing a separate recipe for scalarizing that can be created without relying on the underlying IR.
Depends on #80271
Patch is 125.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80273.diff
18 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 17a0d01f18072..4ee878358f9bc 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9233,42 +9233,11 @@ void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
"Not a pointer induction according to InductionDescriptor!");
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
"Unexpected type.");
+ assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
+ "Recipe should have been replaced");
auto *IVR = getParent()->getPlan()->getCanonicalIV();
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
-
- if (onlyScalarsGenerated(State.VF)) {
- // This is the normalized GEP that starts counting at zero.
- Value *PtrInd = State.Builder.CreateSExtOrTrunc(
- CanonicalIV, IndDesc.getStep()->getType());
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If the instruction is uniform, we only need to generate the
- // first lane. Otherwise, we generate all VF values.
- bool IsUniform = vputils::onlyFirstLaneUsed(this);
- assert((IsUniform || !State.VF.isScalable()) &&
- "Cannot scalarize a scalable VF");
- unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
-
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- Value *PartStart =
- createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
-
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
- Value *Idx = State.Builder.CreateAdd(
- PartStart, ConstantInt::get(PtrInd->getType(), Lane));
- Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
-
- Value *Step = State.get(getOperand(1), VPIteration(Part, Lane));
- Value *SclrGep = emitTransformedIndex(
- State.Builder, GlobalIdx, IndDesc.getStartValue(), Step,
- IndDesc.getKind(), IndDesc.getInductionBinOp());
- SclrGep->setName("next.gep");
- State.set(this, SclrGep, VPIteration(Part, Lane));
- }
- }
- return;
- }
-
Type *PhiType = IndDesc.getStep()->getType();
// Build a pointer phi
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 7ed07fe5f413a..51576d9f5364e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -859,7 +859,7 @@ void VPlan::execute(VPTransformState *State) {
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
// TODO: Split off the case that all users of a pointer phi are scalar
// from the VPWidenPointerInductionRecipe.
- if (WidenPhi->onlyScalarsGenerated(State->VF))
+ if (WidenPhi->onlyScalarsGenerated(State->VF.isScalable()))
continue;
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 97035146a2f4d..4904287412f8a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1156,6 +1156,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
BranchOnCount,
BranchOnCond,
ComputeReductionResult,
+ PtrAdd,
};
private:
@@ -1736,7 +1737,7 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
void execute(VPTransformState &State) override;
/// Returns true if only scalar values will be generated.
- bool onlyScalarsGenerated(ElementCount VF);
+ bool onlyScalarsGenerated(bool Scalable);
/// Returns the induction descriptor for the recipe.
const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
@@ -2502,6 +2503,12 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
Start, CanonicalIV, Step) {}
+ VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind, VPValue *Start,
+ VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
+ FPMathOperator *FPBinOp)
+ : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
+ Kind(Kind), FPBinOp(FPBinOp) {}
+
~VPDerivedIVRecipe() override = default;
VPRecipeBase *clone() override {
@@ -2957,6 +2964,9 @@ class VPlan {
}
bool hasVF(ElementCount VF) { return VFs.count(VF); }
+ bool hasScalableVF() {
+ return any_of(VFs, [](ElementCount VF) { return VF.isScalable(); });
+ }
bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 515dc41a55ea1..bd2f65935e479 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -43,6 +43,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
CachedTypes[OtherV] = ResTy;
return ResTy;
}
+ case VPInstruction::PtrAdd:
+ return inferScalarType(R->getOperand(0));
default:
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 77f2cf899b085..d6550821fb57a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -127,6 +127,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInstruction::Not:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
+ case VPInstruction::PtrAdd:
return false;
default:
return true;
@@ -489,6 +490,23 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
return ReducedPartRdx;
}
+ case VPInstruction::PtrAdd: {
+ if (vputils::onlyFirstLaneUsed(this)) {
+ auto *P =
+ Builder.CreatePtrAdd(State.get(getOperand(0), VPIteration(Part, 0)),
+ State.get(getOperand(1), VPIteration(Part, 0)));
+ State.set(this, P, VPIteration(Part, 0));
+ } else {
+ for (unsigned Lane = 0; Lane != State.VF.getKnownMinValue(); ++Lane) {
+ Value *P = Builder.CreatePtrAdd(
+ State.get(getOperand(0), VPIteration(Part, Lane)),
+ State.get(getOperand(1), VPIteration(Part, Lane)));
+
+ State.set(this, P, VPIteration(Part, Lane));
+ }
+ }
+ return nullptr;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@@ -515,6 +533,8 @@ void VPInstruction::execute(VPTransformState &State) {
State.Builder.setFastMathFlags(getFastMathFlags());
for (unsigned Part = 0; Part < State.UF; ++Part) {
Value *GeneratedValue = generateInstruction(State, Part);
+ if (!GeneratedValue)
+ continue;
if (!hasResult())
continue;
assert(GeneratedValue && "generateInstruction must produce a value");
@@ -598,6 +618,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::ComputeReductionResult:
O << "compute-reduction-result";
break;
+ case VPInstruction::PtrAdd:
+ O << "ptradd";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1686,9 +1709,9 @@ bool VPCanonicalIVPHIRecipe::isCanonical(
return StepC && StepC->isOne();
}
-bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
+bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(bool Scalable) {
return IsScalarAfterVectorization &&
- (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
+ (!Scalable || vputils::onlyFirstLaneUsed(this));
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 71f5285f90236..6964f1bec2675 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -489,15 +489,18 @@ void VPlanTransforms::removeDeadRecipes(VPlan &Plan) {
}
}
-static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
+static VPValue *createScalarIVSteps(VPlan &Plan,
+ InductionDescriptor::InductionKind Kind,
ScalarEvolution &SE, Instruction *TruncI,
VPValue *StartV, VPValue *Step,
- VPBasicBlock::iterator IP) {
+ Instruction::BinaryOps InductionOpcode,
+ VPBasicBlock::iterator IP,
+ FPMathOperator *FPBinOp = nullptr) {
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
VPSingleDefRecipe *BaseIV = CanonicalIV;
- if (!CanonicalIV->isCanonical(ID.getKind(), StartV, Step)) {
- BaseIV = new VPDerivedIVRecipe(ID, StartV, CanonicalIV, Step);
+ if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
+ BaseIV = new VPDerivedIVRecipe(Kind, StartV, CanonicalIV, Step, FPBinOp);
HeaderVPBB->insert(BaseIV, IP);
}
@@ -526,7 +529,9 @@ static VPValue *createScalarIVSteps(VPlan &Plan, const InductionDescriptor &ID,
VecPreheader->appendRecipe(Step->getDefiningRecipe());
}
- VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(ID, BaseIV, Step);
+ VPScalarIVStepsRecipe *Steps = new VPScalarIVStepsRecipe(
+ BaseIV, Step, InductionOpcode,
+ FPBinOp ? FPBinOp->getFastMathFlags() : FastMathFlags());
HeaderVPBB->insert(Steps, IP);
return Steps;
}
@@ -537,6 +542,30 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
VPBasicBlock::iterator InsertPt = HeaderVPBB->getFirstNonPhi();
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+ if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
+ if (!PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
+ continue;
+
+ const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
+ VPValue *StartV = Plan.getVPValueOrAddLiveIn(
+ ConstantInt::get(ID.getStep()->getType(), 0));
+ VPValue *StepV = PtrIV->getOperand(1);
+ VPRecipeBase *Steps =
+ createScalarIVSteps(Plan, InductionDescriptor::IK_IntInduction, SE,
+ nullptr, StartV, StepV, Instruction::Add,
+ InsertPt)
+ ->getDefiningRecipe();
+
+ auto *Recipe =
+ new VPInstruction(VPInstruction::PtrAdd,
+ {PtrIV->getStartValue(), Steps->getVPSingleValue()},
+ PtrIV->getDebugLoc());
+
+ Recipe->insertAfter(Steps);
+ PtrIV->replaceAllUsesWith(Recipe);
+ continue;
+ }
+
auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
if (!WideIV)
continue;
@@ -546,9 +575,10 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
continue;
const InductionDescriptor &ID = WideIV->getInductionDescriptor();
- VPValue *Steps = createScalarIVSteps(Plan, ID, SE, WideIV->getTruncInst(),
- WideIV->getStartValue(),
- WideIV->getStepValue(), InsertPt);
+ VPValue *Steps = createScalarIVSteps(
+ Plan, ID.getKind(), SE, WideIV->getTruncInst(), WideIV->getStartValue(),
+ WideIV->getStepValue(), ID.getInductionOpcode(), InsertPt,
+ dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()));
// Update scalar users of IV to use Step instead.
if (!HasOnlyVectorVFs)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index 24c59fdb47b61..00ec396107dcb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -11,76 +11,74 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 10000
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x ptr> [[TMP2]], ptr [[NEXT_GEP1]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP2]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[NEXT_GEP3]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <2 x ptr> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP10]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]])
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2
-; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1
-; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[PTR_START_1:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> [[TMP6]], ptr [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x ptr> [[TMP10]], ptr [[TMP9]], i32 1
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <2 x ptr> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <2 x ptr> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP14]])
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP15]])
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP16]])
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP17]])
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP4]], i32 2
+; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP18]], align 1
+; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP19]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END6:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000
+; CHECK-NEXT: [[IND_END1:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX8]], 0
-; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX8]], 1
-; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP9]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP10]], i32 1
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]])
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]])
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i32 0
-; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 10000
-; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX3]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX3]], 1
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP22]]
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP23]], i32 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x ptr> [[TMP25]], ptr [[TMP24]], i32 1
+; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <2 x ptr> [[TMP26]], zeroinitializer
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP27]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP28]])
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[TMP27]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP29]])
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0
+; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP30]], align 1
+; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX3]], 2
+; CHECK-...
[truncated]
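As a standalone illustration (not part of the patch): IRBuilder::CreatePtrAdd, which the new PtrAdd opcode lowers through, is shorthand for a GEP with source element type i8. A minimal sketch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("ptradd-demo", Ctx);
  auto *FnTy = FunctionType::get(
      Type::getVoidTy(Ctx), {PointerType::get(Ctx, 0), Type::getInt64Ty(Ctx)},
      /*isVarArg=*/false);
  Function *F = Function::Create(FnTy, Function::ExternalLinkage, "demo", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

  Value *Base = F->getArg(0), *Off = F->getArg(1);
  Base->setName("base");
  Off->setName("off");
  // Emits: %p = getelementptr i8, ptr %base, i64 %off
  Value *P = B.CreatePtrAdd(Base, Off, "p");
  // The equivalent spelled-out form produces the same instruction:
  Value *Q = B.CreateGEP(B.getInt8Ty(), Base, Off, "q");
  (void)P;
  (void)Q;
  B.CreateRetVoid();
  M.print(outs(), nullptr); // both GEPs print with source element type i8
  return 0;
}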
Force-pushed from 2fd69ff to 916a7d2.
Nice refactoring clean-up! Adding some comments.
    if (WidenPhi->onlyScalarsGenerated(State->VF.isScalable()))
      continue;

    assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()));
nit: assert message.
Added, thanks!
@@ -2503,6 +2504,12 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
           dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
           Start, CanonicalIV, Step) {}

+  VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind, VPValue *Start,
+                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
+                    FPMathOperator *FPBinOp)
nit: this is identical to the private constructor above, except for accepting a non-const FPBinOp?
Made the private one public and removed this one here, thanks!
        State.set(this, P, VPIteration(Part, Lane));
      }
    }
    return nullptr;
Better for generateInstruction() to continue to generate and return a single per-part Value, which is then set in State, possibly renaming it generateValuePerPart(), and to have a separate generateValuePerLane() - currently to be invoked only for PtrAdd when all lanes are used?
Updated to split into generation for scalars (per lane, possibly optimizing depending on onlyFirstLaneUsed) and generation for vectors (per part).
Some of it could be done separately or as part of #80271, but it would be good to agree on the overall structure first, then land the pieces separately as makes sense. A rough sketch of the split follows below.
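For concreteness, a rough sketch of the structure discussed here, reusing names that already appear in the patch; generatePerPart, generatePerLane, and isScalarizedOpcode are placeholder names for illustration, not the final upstream API:

// Sketch only; depends on LLVM's internal VPlan headers and is not meant to
// compile standalone. Placeholder names are marked below.
void VPInstruction::execute(VPTransformState &State) {
  if (isScalarizedOpcode(getOpcode())) { // placeholder predicate, e.g. PtrAdd
    // Scalarized opcodes produce one value per (part, lane), generating only
    // the first lane when vputils::onlyFirstLaneUsed(this) holds.
    unsigned Lanes =
        vputils::onlyFirstLaneUsed(this) ? 1 : State.VF.getKnownMinValue();
    for (unsigned Part = 0; Part != State.UF; ++Part)
      for (unsigned Lane = 0; Lane != Lanes; ++Lane) {
        VPIteration It(Part, Lane);
        State.set(this, generatePerLane(State, It), It); // placeholder name
      }
    return;
  }
  // All other opcodes produce a single per-part value.
  for (unsigned Part = 0; Part != State.UF; ++Part) {
    Value *V = generatePerPart(State, Part); // placeholder name
    if (hasResult())
      State.set(this, V, Part);
  }
}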
@@ -515,6 +533,8 @@ void VPInstruction::execute(VPTransformState &State) {
     State.Builder.setFastMathFlags(getFastMathFlags());
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     Value *GeneratedValue = generateInstruction(State, Part);
+    if (!GeneratedValue)
+      continue;
     if (!hasResult())
       continue;
     assert(GeneratedValue && "generateInstruction must produce a value");
This assert is now redundant.
Reworked now; the check for !GeneratedValue is gone.
    if (!GeneratedValue)
      continue;
    if (!hasResult())
      continue;

Suggested change:
-    if (!GeneratedValue)
-      continue;
-    if (!hasResult())
-      continue;
+    if (!GeneratedValue || !hasResult())
+      continue;
Completely reworked this; the check for !GeneratedValue is gone now.
  if (!CanonicalIV->isCanonical(Kind, StartV, Step)) {
    BaseIV = new VPDerivedIVRecipe(Kind, StartV, CanonicalIV, Step, FPBinOp);
Should this refactoring to accept and pass Kind instead of ID be pushed as a separate simplifying preparation?
Yes, can split off!
@@ -537,6 +542,30 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
   bool HasOnlyVectorVFs = !Plan.hasVF(ElementCount::getFixed(1));
   VPBasicBlock::iterator InsertPt = HeaderVPBB->getFirstNonPhi();
   for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
nit: worth adding a comment describing what unfolds next.
Plus revisit the documentation of optimizeInductions().
Added a comment, thanks!
      VPValue *StartV = Plan.getVPValueOrAddLiveIn(
          ConstantInt::get(ID.getStep()->getType(), 0));
      VPValue *StepV = PtrIV->getOperand(1);
      VPRecipeBase *Steps =
Have createScalarIVSteps() return a VPSingleDefRecipe (or even VPScalarIVStepsRecipe) to avoid going through getDefiningRecipe() and getVPSingleValue()?
Will do separately, thanks!
    VPValue *Steps = createScalarIVSteps(
        Plan, ID.getKind(), SE, WideIV->getTruncInst(), WideIV->getStartValue(),
        WideIV->getStepValue(), ID.getInductionOpcode(), InsertPt,
        dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()));
nit: seems more logical to group the three fields of ID and pass them as adjacent parameters?
Adjusted, thanks! Can split off moving the induction opcode field separately.
      const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
      VPValue *StartV = Plan.getVPValueOrAddLiveIn(
          ConstantInt::get(ID.getStep()->getType(), 0));
nit: would getting the Type of ID.getStartValue() be more consistent?
The start value of the pointer induction is the pointer base, but here we need the start value for the generated offsets.
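To illustrate why the offset chain starts at zero in the step's type, a tiny standalone model of the decomposition (illustrative only, not LLVM code): the scalarized pointer IV base + i * step becomes an integer scalar-steps chain starting at 0, followed by a PtrAdd off the pointer base.

#include <cstdint>
#include <cstdio>

int main() {
  char buffer[64] = {};
  char *base = buffer; // pointer IV start value: the base pointer
  int64_t step = 4;    // pointer induction step, in bytes

  for (int64_t i = 0; i < 8; ++i) {
    int64_t offset = 0 + i * step; // scalar-steps: integer IV starting at 0
    char *p = base + offset;       // PtrAdd: i8 GEP of base by the offset
    std::printf("iteration %lld -> %p\n", (long long)i, (void *)p);
  }
  return 0;
}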