-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[VPlan] Add ReductionStartVector VPInstruction. #142290
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-backend-powerpc Author: Florian Hahn (fhahn) Changes: Add a new VPInstruction::ReductionStartVector opcode to create the start values for wide reductions. This more accurately models the start value creation in VPlan and simplifies VPReductionPHIRecipe::execute. Down the line it also allows removing VPReductionPHIRecipe::RdxDesc. Patch is 35.23 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142290.diff 14 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e9ace195684b3..1830ea9678e85 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7501,8 +7501,11 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
const RecurrenceDescriptor &RdxDesc =
EpiRedHeaderPhi->getRecurrenceDescriptor();
- Value *MainResumeValue =
- EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
+ Value *MainResumeValue;
+ if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue()))
+ MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
+ else
+ MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
RdxDesc.getRecurrenceKind())) {
auto *Cmp = cast<ICmpInst>(MainResumeValue);
@@ -8552,6 +8555,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
// If the PHI is used by a partial reduction, set the scale factor.
unsigned ScaleFactor =
getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
+
PhiRecipe = new VPReductionPHIRecipe(
Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
CM.useOrderedReductions(RdxDesc), ScaleFactor);
@@ -9439,7 +9443,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
continue;
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
- Type *PhiTy = PhiR->getOperand(0)->getLiveInIRValue()->getType();
+ Type *PhiTy = PhiR->getUnderlyingValue()->getType();
// If tail is folded by masking, introduce selects between the phi
// and the users outside the vector region of each reduction, at the
// beginning of the dedicated latch block.
@@ -9569,6 +9573,27 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
// start value.
PhiR->setOperand(0, Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()));
}
+ RecurKind RK = RdxDesc.getRecurrenceKind();
+ if (PhiR->isOrdered() || PhiR->isInLoop() ||
+ (!RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK) &&
+ !RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))) {
+ VPBuilder PHBuilder(Plan->getVectorPreheader());
+ VPValue *Iden = Plan->getOrAddLiveIn(
+ getRecurrenceIdentity(RK, PhiTy, RdxDesc.getFastMathFlags()));
+ // If the PHI is used by a partial reduction, set the scale factor.
+ unsigned ScaleFactor =
+ RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
+ .value_or(1);
+ Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
+ auto *ScalarFactorVPV =
+ Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+ VPValue *StartV =
+ PHBuilder.createNaryOp(VPInstruction::ReductionStartVector,
+ {PhiR->getStartValue(), Iden, ScalarFactorVPV},
+ RdxDesc.getFastMathFlags());
+ PhiR->setOperand(0, StartV);
+ }
}
for (VPRecipeBase *R : ToDelete)
R->eraseFromParent();
@@ -10081,6 +10106,12 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
}
assert(ResumeV && "Must have a resume value");
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
+ if (auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ if (auto *VPI = dyn_cast<VPInstruction>(PhiR->getStartValue())) {
+ VPI->setOperand(0, StartVal);
+ continue;
+ }
+ }
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 44f0b6d964a6e..c49e20518b506 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -907,6 +907,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
BranchOnCount,
BranchOnCond,
Broadcast,
+ /// Start vector for reductions with 3 operands: the original start value,
+ /// the identity value for the reduction and an integer indicating the
+ /// scaling factor.
+ ReductionStartVector,
ComputeFindLastIVResult,
ComputeReductionResult,
// Extracts the last lane from its operand if it is a vector, or the last
@@ -2225,13 +2229,6 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Returns true, if the phi is part of an in-loop reduction.
bool isInLoop() const { return IsInLoop; }
-
- /// Returns true if the recipe only uses the first lane of operand \p Op.
- bool onlyFirstLaneUsed(const VPValue *Op) const override {
- assert(is_contained(operands(), Op) &&
- "Op must be an operand of the recipe");
- return Op == getStartValue();
- }
};
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 926490bfad7d0..7cfb07ab25e48 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -87,6 +87,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
inferScalarType(R->getOperand(1)) &&
"different types inferred for different operands");
return IntegerType::get(Ctx, 1);
+ case VPInstruction::ReductionStartVector:
+ return inferScalarType(R->getOperand(0));
case VPInstruction::ComputeFindLastIVResult:
case VPInstruction::ComputeReductionResult: {
auto *PhiR = cast<VPReductionPHIRecipe>(R->getOperand(0));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index a4831ea7c11f7..0cdd3216288ea 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -604,6 +604,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
return Builder.CreateVectorSplat(
State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
}
+ case VPInstruction::ReductionStartVector: {
+ if (State.VF.isScalar())
+ return State.get(getOperand(0), true);
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(getFastMathFlags());
+ // If this start vector is scaled then it should produce a vector with fewer
+ // elements than the VF.
+ ElementCount VF = State.VF.divideCoefficientBy(
+ cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue());
+ auto *Iden = Builder.CreateVectorSplat(VF, State.get(getOperand(1), true));
+ Constant *Zero = Builder.getInt32(0);
+ return Builder.CreateInsertElement(Iden, State.get(getOperand(0), true),
+ Zero);
+ }
case VPInstruction::ComputeFindLastIVResult: {
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
// and will be removed by breaking up the recipe further.
@@ -892,6 +906,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
case VPInstruction::PtrAdd:
case VPInstruction::WideIVStep:
case VPInstruction::StepVector:
+ case VPInstruction::ReductionStartVector:
return false;
default:
return true;
@@ -922,6 +937,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::BranchOnCount:
case VPInstruction::BranchOnCond:
+ case VPInstruction::ReductionStartVector:
return true;
case VPInstruction::PtrAdd:
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
@@ -1023,6 +1039,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::FirstActiveLane:
O << "first-active-lane";
break;
+ case VPInstruction::ReductionStartVector:
+ O << "reduction-start-vector";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -1613,6 +1632,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
Opcode == Instruction::FCmp || Opcode == Instruction::Select ||
Opcode == VPInstruction::WideIVStep ||
+ Opcode == VPInstruction::ReductionStartVector ||
Opcode == VPInstruction::ComputeReductionResult;
case OperationType::NonNegOp:
return Opcode == Instruction::ZExt;
@@ -3843,17 +3863,19 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
#endif
void VPReductionPHIRecipe::execute(VPTransformState &State) {
- // If this phi is fed by a scaled reduction then it should output a
- // vector with fewer elements than the VF.
- ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor);
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ VPValue *StartVPV = getStartValue();
// In order to support recurrences we need to be able to vectorize Phi nodes.
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- auto *ScalarTy = State.TypeAnalysis.inferScalarType(this);
+ BasicBlock *VectorPH =
+ State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
bool ScalarPHI = State.VF.isScalar() || IsInLoop;
- Type *VecTy = ScalarPHI ? ScalarTy : VectorType::get(ScalarTy, VF);
+ Value *StartV = State.get(StartVPV, ScalarPHI);
+ Type *VecTy = StartV->getType();
BasicBlock *HeaderBB = State.CFG.PrevBB;
assert(State.CurrentParentLoop->getHeader() == HeaderBB &&
@@ -3862,49 +3884,7 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
Phi->insertBefore(HeaderBB->getFirstInsertionPt());
State.set(this, Phi, IsInLoop);
- BasicBlock *VectorPH =
- State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));
- // Create start and identity vector values for the reduction in the preheader.
- // TODO: Introduce recipes in VPlan preheader to create initial values.
- IRBuilderBase::InsertPointGuard IPBuilder(State.Builder);
- State.Builder.SetInsertPoint(VectorPH->getTerminator());
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- VPValue *StartVPV = getStartValue();
- RecurKind RK = RdxDesc.getRecurrenceKind();
- if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
- RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
- RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
- // [I|F]FindLastIV will use a sentinel value to initialize the reduction
- // phi or the resume value from the main vector loop when vectorizing the
- // epilogue loop. In the exit block, ComputeReductionResult will generate
- // checks to verify if the reduction result is the sentinel value. If the
- // result is the sentinel value, it will be corrected back to the start
- // value.
- // TODO: The sentinel value is not always necessary. When the start value is
- // a constant, and smaller than the start value of the induction variable,
- // the start value can be directly used to initialize the reduction phi.
- Phi->addIncoming(State.get(StartVPV, ScalarPHI), VectorPH);
- return;
- }
-
- Value *Iden = getRecurrenceIdentity(RK, VecTy->getScalarType(),
- RdxDesc.getFastMathFlags());
- unsigned CurrentPart = getUnrollPart(*this);
- Value *StartV = StartVPV->getLiveInIRValue();
- if (!ScalarPHI) {
- if (CurrentPart == 0) {
- Iden = State.Builder.CreateVectorSplat(VF, Iden);
- Constant *Zero = State.Builder.getInt32(0);
- StartV = State.Builder.CreateInsertElement(Iden, StartV, Zero);
- } else {
- Iden = State.Builder.CreateVectorSplat(VF, Iden);
- }
- }
-
- Value *StartVal = (CurrentPart == 0) ? StartV : Iden;
- Phi->addIncoming(StartVal, VectorPH);
+ Phi->addIncoming(StartV, VectorPH);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5c8849be3d23e..354dac84a0c83 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1151,6 +1151,22 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
R.setOperand(0, Y);
return;
}
+
+ auto *Plan = R.getParent()->getPlan();
+ if (!Plan->isUnrolled())
+ return;
+ /// Simplify redundant ReductionStartVector recipes after unrolling.
+ VPValue *StartV;
+ if (match(&R, m_VPInstruction<VPInstruction::ReductionStartVector>(
+ m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
+ R.getVPSingleValue()->replaceUsesWithIf(
+ StartV, [&R](const VPUser &U, unsigned Idx) {
+ auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
+ return PhiR && R.getVPSingleValue() == PhiR->getOperand(Idx) &&
+ PhiR->isInLoop();
+ });
+ return;
+ }
}
void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index e1fb3d476c58d..3a48feb8c4ef2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -223,6 +223,20 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
Copy->addOperand(R);
Copy->addOperand(getConstantVPV(Part));
} else if (RdxPhi) {
+ // If the start value is a ReductionStartVector, use the identity value (second operand) for unrolled parts. If the scaling factor is > 1, create a new ReductionStartVector with the scale factor and both operands set to the identity value.
+ if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) {
+ if (cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
+ ->getZExtValue() == 1)
+ Copy->setOperand(0, VPI->getOperand(1));
+ else {
+ if (Part == 1) {
+ auto *C = VPI->clone();
+ C->setOperand(0, C->getOperand(1));
+ C->insertAfter(VPI);
+ addUniformForAllParts(C);
+ }
+ }
+ }
Copy->addOperand(getConstantVPV(Part));
} else {
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 0e5e785a94636..c3fc91c4574f1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -161,8 +161,8 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[IDX_NEG]], [[N_VEC5]]
-; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[IV]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index f1947dec2ea23..b4987127a513d 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -153,10 +153,10 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK-NEXT: [[N_MOD_VF24:%.*]] = urem i64 [[TMP2]], 2
; CHECK-NEXT: [[N_VEC25:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF24]]
; CHECK-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC25]]
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], <i64 0, i64 1>
-; CHECK-NEXT: [[TMP57:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT32:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
index 01a7ea4ffcd05..3f17c95f7ca95 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll
@@ -277,9 +277,9 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-OUTLOOP: vector.body:
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -581,8 +581,8 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
@@ -771,8 +771,8 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) {
; NO-VP-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; NO-VP-OUTLOOP-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; NO-VP-OUTLOOP-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP11:%.*]] = insertelement <vscale x 4 x i32> zeroinitializer, i32 [[START]], i32 0
+; NO-VP-OUTLOOP-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul <vscale x 4 x i32> [[TMP12]], splat (i32 1)
; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP14]]
; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP10]] to i32
diff --...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
aefed54
to
d7bfe5b
Compare
Replace the RecurrenceDescriptor with RecurKind in VPReductionPHIRecipe, as all VPlan analyses and codegen only require the recurrence kind. This enables creating new VPReductionPHIRecipe directly in LV, without needing to construct a whole RecurrenceDescriptor object. Depends on llvm#141860 llvm#141932 llvm#142290 llvm#142291
d7bfe5b
to
2c3c96d
Compare
@@ -8286,6 +8289,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, | |||
// If the PHI is used by a partial reduction, set the scale factor. | |||
unsigned ScaleFactor = | |||
getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Redundant blank
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stripped, thanks!
case VPInstruction::ReductionStartVector: | ||
return inferScalarType(R->getOperand(0)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This code matches Instruction::ExtractElement and Instruction::Freeze; maybe we can combine them to avoid duplication?
m_VPValue(StartV), m_VPValue(), m_VPValue()))) { | ||
Def->replaceUsesWithIf(StartV, [Def](const VPUser &U, unsigned Idx) { | ||
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U); | ||
return PhiR && Def == PhiR->getOperand(Idx) && PhiR->isInLoop(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is `Def == PhiR->getOperand(Idx)` needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep that's not needed in the latest version, removed thanks!
if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) { | ||
if (cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) | ||
->getZExtValue() == 1) | ||
Copy->setOperand(0, VPI->getOperand(1)); | ||
else { | ||
if (Part == 1) { | ||
auto *C = VPI->clone(); | ||
C->setOperand(0, C->getOperand(1)); | ||
C->insertAfter(VPI); | ||
addUniformForAllParts(C); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Some format suggestions.
if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) { | |
if (cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) | |
->getZExtValue() == 1) | |
Copy->setOperand(0, VPI->getOperand(1)); | |
else { | |
if (Part == 1) { | |
auto *C = VPI->clone(); | |
C->setOperand(0, C->getOperand(1)); | |
C->insertAfter(VPI); | |
addUniformForAllParts(C); | |
} | |
} | |
} | |
if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) | |
if (cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) | |
->getZExtValue() == 1) { | |
Copy->setOperand(0, VPI->getOperand(1)); | |
} else if (Part == 1) { | |
auto *C = VPI->clone(); | |
C->setOperand(0, C->getOperand(1)); | |
C->insertAfter(VPI); | |
addUniformForAllParts(C); | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
Replace the RecurrenceDescriptor with RecurKind in VPReductionPHIRecipe, as all VPlan analyses and codegen only require the recurrence kind. This enables creating new VPReductionPHIRecipe directly in LV, without needing to construct a whole RecurrenceDescriptor object. Depends on llvm#141860 llvm#141932 llvm#142290 llvm#142291
Replace the RecurrenceDescriptor with RecurKind in VPReductionPHIRecipe, as all VPlan analyses and codegen only require the recurrence kind. This enables creating new VPReductionPHIRecipe directly in LV, without needing to construct a whole RecurrenceDescriptor object. Depends on llvm#141860 llvm#141932 llvm#142290 llvm#142291
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that ReductionStartVector will need to be handled in VPlanAnalysis::getVFScaleFactor so that its register usage isn't overestimated.
RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr()) | ||
.value_or(1); | ||
Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext()); | ||
auto *ScalarFactorVPV = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Scalar -> Scale
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed, thanks!
// If this phi is fed by a scaled reduction then it should output a | ||
// vector with fewer elements than the VF. | ||
ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this no longer needed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nope, both the start value and the backedge value now are narrowed already.
Add a new VPInstruction::ReductionStartVector opcode to create the start values for wide reductions. This more accurately models the start value creation in VPlan and simplifies VPReductionPHIRecipe::execute.
2c3c96d
to
ab43958
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that ReductionStartVector will need to be handled in VPlanAnalysis::getVFScaleFactor so that its register usage isn't overestimated.
Thanks, I think for now we never try to get the scaling factor, as it is produced outside the vector loop. I added an assert for now.
@@ -8286,6 +8289,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, | |||
// If the PHI is used by a partial reduction, set the scale factor. | |||
unsigned ScaleFactor = | |||
getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1); | |||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stripped, thanks!
RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr()) | ||
.value_or(1); | ||
Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext()); | ||
auto *ScalarFactorVPV = |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed, thanks!
// If this phi is fed by a scaled reduction then it should output a | ||
// vector with fewer elements than the VF. | ||
ElementCount VF = State.VF.divideCoefficientBy(VFScaleFactor); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nope, both the start value and the backedge value now are narrowed already.
m_VPValue(StartV), m_VPValue(), m_VPValue()))) { | ||
Def->replaceUsesWithIf(StartV, [Def](const VPUser &U, unsigned Idx) { | ||
auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U); | ||
return PhiR && Def == PhiR->getOperand(Idx) && PhiR->isInLoop(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep that's not needed in the latest version, removed thanks!
if (auto *VPI = dyn_cast<VPInstruction>(RdxPhi->getStartValue())) { | ||
if (cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue()) | ||
->getZExtValue() == 1) | ||
Copy->setOperand(0, VPI->getOperand(1)); | ||
else { | ||
if (Part == 1) { | ||
auto *C = VPI->clone(); | ||
C->setOperand(0, C->getOperand(1)); | ||
C->insertAfter(VPI); | ||
addUniformForAllParts(C); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done thanks
Replace the RecurrenceDescriptor with RecurKind in VPReductionPHIRecipe, as all VPlan analyses and codegen only require the recurrence kind. This enables creating new VPReductionPHIRecipe directly in LV, without needing to construct a whole RecurrenceDescriptor object. Depends on llvm#141860 llvm#141932 llvm#142290 llvm#142291
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you, looks good to me!
Add a new VPInstruction::ReductionStartVector opcode to create the start values for wide reductions. This more accurately models the start value creation in VPlan and simplifies VPReductionPHIRecipe::execute. Down the line it also allows removing VPReductionPHIRecipe::RdxDesc.