-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[VPlan] Introduce explicit broadcasts for live-ins. #124644
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesAdd a new VPInstruction::Broadcast opcode and use it to materialize explicit broadcasts of live-ins. The initial patch only materlizes the broadcasts if the vector preheader dominates all uses that need it. Later patches will pick the best valid insert point, thus retiring implicit hoisting of broadcasts from VPTransformsState::get(). Patch is 153.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124644.diff 35 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f592e5557c17d5..a5fa304b7631e6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7648,7 +7648,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
((VectorizingEpilogue && ExpandedSCEVs) ||
(!VectorizingEpilogue && !ExpandedSCEVs)) &&
"expanded SCEVs to reuse can only be used during epilogue vectorization");
-
+ VPlanTransforms::materializeBroadcasts(BestVPlan);
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
VPlanTransforms::unrollByUF(BestVPlan, BestUF,
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 253f22d299b623..2b3319250673de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1210,6 +1210,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
CanonicalIVIncrementForPart,
BranchOnCount,
BranchOnCond,
+ Broadcast,
ComputeReductionResult,
// Takes the VPValue to extract from as first operand and the lane or part
// to extract as second operand, counting from the end starting with 1 for
@@ -1855,6 +1856,13 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags {
bool isInvariantCond() const {
return getCond()->isDefinedOutsideLoopRegions();
}
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getCond() && isInvariantCond();
+ }
};
/// A recipe for handling GEP instructions.
@@ -1902,6 +1910,13 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getOperand(0);
+ }
};
/// A recipe to compute the pointers for widened memory accesses of IndexTy
@@ -2217,6 +2232,13 @@ class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe {
VPValue *getLastUnrolledPartOperand() {
return getNumOperands() == 5 ? getOperand(4) : this;
}
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getStartValue();
+ }
};
class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
@@ -2249,6 +2271,13 @@ class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe,
/// Returns true if only scalar values will be generated.
bool onlyScalarsGenerated(bool IsScalable);
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getOperand(0);
+ }
+
/// Returns the VPValue representing the value of this induction at
/// the first unrolled part, if it exists. Returns itself if unrolling did not
/// take place.
@@ -2377,6 +2406,13 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getStartValue();
+ }
};
/// A recipe for handling reduction phis. The start value is the first operand
@@ -2443,6 +2479,13 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
/// Returns true, if the phi is part of an in-loop reduction.
bool isInLoop() const { return IsInLoop; }
+
+ /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override {
+ assert(is_contained(operands(), Op) &&
+ "Op must be an operand of the recipe");
+ return Op == getStartValue();
+ }
};
/// A recipe for forming partial reductions. In the loop, an accumulator and
@@ -4054,6 +4097,8 @@ class VPlan {
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); }
+ ArrayRef<VPValue *> getLiveIns() const { return VPLiveInsToFree; }
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the live-ins of this VPlan to \p O.
void printLiveIns(raw_ostream &O) const;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 27357ff04b5f2a..1408f10b23e1e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -89,6 +89,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
inferScalarType(R->getOperand(1))->isIntegerTy(1) &&
"LogicalAnd operands should be bool");
return IntegerType::get(Ctx, 1);
+ case VPInstruction::Broadcast:
case VPInstruction::PtrAdd:
// Return the type based on the pointer argument (i.e. first operand).
return inferScalarType(R->getOperand(0));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2679ed6b26b5d1..ec442ea201e6d3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -575,6 +575,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
return CondBr;
}
+ case VPInstruction::Broadcast: {
+ return Builder.CreateVectorSplat(
+ State.VF, State.get(getOperand(0), /*IsScalar*/ true), "broadcast");
+ }
case VPInstruction::ComputeReductionResult: {
// FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
// and will be removed by breaking up the recipe further.
@@ -790,7 +794,6 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case Instruction::ICmp:
case Instruction::Select:
case Instruction::Or:
- case VPInstruction::PtrAdd:
// TODO: Cover additional opcodes.
return vputils::onlyFirstLaneUsed(this);
case VPInstruction::ActiveLaneMask:
@@ -801,6 +804,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
case VPInstruction::BranchOnCond:
case VPInstruction::ResumePhi:
return true;
+ case VPInstruction::PtrAdd:
+ return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this);
};
llvm_unreachable("switch should return");
}
@@ -873,6 +878,10 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
+ case VPInstruction::Broadcast:
+ O << "broadcast";
+ break;
+
case VPInstruction::ExtractFromEnd:
O << "extract-from-end";
break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 714250a56ff576..0ca0b864ef053c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2146,3 +2146,33 @@ bool VPlanTransforms::handleUncountableEarlyExit(
LatchExitingBranch->eraseFromParent();
return true;
}
+
+void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
+ VPDominatorTree VPDT;
+ VPDT.recalculate(Plan);
+ auto *VectorPreheader = Plan.getVectorPreheader();
+ VPBuilder Builder(VectorPreheader);
+ for (VPValue *LiveIn : Plan.getLiveIns()) {
+ if (all_of(LiveIn->users(),
+ [LiveIn](VPUser *U) {
+ return cast<VPRecipeBase>(U)->usesScalars(LiveIn);
+ }) ||
+ !LiveIn->getLiveInIRValue() ||
+ isa<Constant>(LiveIn->getLiveInIRValue()))
+ continue;
+
+ // Add explicit broadcast if the vector preheader dominates all users.
+ // TODO: Find valid inert point for all users.
+ if (all_of(LiveIn->users(), [&VPDT, VectorPreheader](VPUser *U) {
+ return VectorPreheader != cast<VPRecipeBase>(U)->getParent() &&
+ VPDT.dominates(VectorPreheader,
+ cast<VPRecipeBase>(U)->getParent());
+ })) {
+ auto *Broadcast =
+ Builder.createNaryOp(VPInstruction::Broadcast, {LiveIn});
+ LiveIn->replaceUsesWithIf(Broadcast, [LiveIn, Broadcast](VPUser &U, unsigned Idx) {
+ return Broadcast != &U && !cast<VPRecipeBase>(&U)->usesScalars(LiveIn);
+ });
+ }
+ }
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index b31fef5d62456b..f749e2f0a81b86 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -144,6 +144,9 @@ struct VPlanTransforms {
static void
optimizeInductionExitUsers(VPlan &Plan,
DenseMap<VPValue *, VPValue *> &EndValues);
+
+ /// Add explicit broadcasts for live-ins used as vectors.
+ static void materializeBroadcasts(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
index 5b77ced73bce0a..10300982bd9f4c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll
@@ -17,14 +17,14 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -103,14 +103,14 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i64> @llvm.stepvector.nxv8i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 8 x i64> [[TMP8]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP7]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP6]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
index 2c37593be78612..6f3d49d442f1e6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -124,14 +124,14 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i
; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[N]], [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP15]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP9]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[M]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -248,13 +248,13 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i64 [[TMP0]], [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[TMP0]])
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP15:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP17:%.*]] = mul <vscale x 2 x i64> [[TMP15]], splat (i64 1)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP9]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[MUL_2_I]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP20]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -262,7 +262,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[TMP23:%.*]] = udiv <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP24:%.*]] = urem i64 [[TMP21]], [[MUL_2_I]]
; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[MUL_1_I]]
; CHECK-NEXT: [[TMP26:%.*]] = urem i64 [[TMP24]], [[MUL_1_I]]
@@ -283,7 +283,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP14]])
; CHECK-NEXT: [[TMP47:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT4]]
; CHECK-NEXT: [[TMP48:%.*]] = extractelement <vscale x 2 x i1> [[TMP47]], i32 0
; CHECK-NEXT: br i1 [[TMP48]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index bf956227334616..350ad5b3e54330 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -106,15 +106,15 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], splat (i64 1)
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADC...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
2aaf89c
to
f4174c0
Compare
f4174c0
to
b33dcb3
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ping :)
@@ -3719,6 +3762,8 @@ class VPlan { | |||
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. | |||
VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); } | |||
|
|||
ArrayRef<VPValue *> getLiveIns() const { return VPLiveInsToFree; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The name looks confusingly similar to getLiveIn
above, but they refer to different things. getLiveIn
lookups a value from Value2VPValue
and getLiveIns
returns the list VPLiveInsToFree
. Should this be called something like getLiveInsToFree
?
Also, perhaps worth adding a comment to the function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I added a comment to the function.
At the moment, VPLiveInsToFree
contains all the VPValues in Value2VPValue (that wasn't always the case). It provides an easy and deterministic way to get all created live-ins. The comment for VPLiveInsToFree
should already be clear as to what it contains, but the name may be a bit confusing, especially with the new use. I updated it to VPLiveIns
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@david-arm WDYT about the latest naming? I'd be planning to land this soon and potentially iterate on the way live-ins are stored separately if needed.
(edit: realized there's a newer comment which I also replied to below)
continue; | ||
|
||
// Add explicit broadcast if the vector preheader dominates all users. | ||
// TODO: Find valid inert point for all users. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I think this should be valid insert point
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you! This is a great first step for explicit splat.
LGTM if the comments from @david-arm is addressed.
if (all_of(LiveIn->users(), | ||
[LiveIn](VPUser *U) { | ||
return cast<VPRecipeBase>(U)->usesScalars(LiveIn); | ||
}) || |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I saw that this patch adds many implementations of onlyFirstLaneUsed
. Is this for the correctness of useScalars
here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, it is to avoid introducing broadcasts that then won't be used.
b33dcb3
to
ad66ff7
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Post approval comments
bool onlyFirstLaneUsed(const VPValue *Op) const override { | ||
assert(is_contained(operands(), Op) && | ||
"Op must be an operand of the recipe"); | ||
return Op == getOperand(0); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
return Op == getOperand(0); | |
return Op == getOperand(0) && isPointerLoopInvariant(); |
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, updated, thanks!
bool onlyFirstLaneUsed(const VPValue *Op) const override { | ||
assert(is_contained(operands(), Op) && | ||
"Op must be an operand of the recipe"); | ||
return Op == getStartValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this be the default for all VPHeaderPhiRecipes?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think so at the moment, VPWidenIntOrFpInductionRecipe is special in a way, it still produces its own vector start value in the preheader. To be cleaned up as followup?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can leave behind a TODO.
@@ -90,6 +90,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { | |||
inferScalarType(R->getOperand(1))->isIntegerTy(1) && | |||
"LogicalAnd operands should be bool"); | |||
return IntegerType::get(Ctx, 1); | |||
case VPInstruction::Broadcast: | |||
case VPInstruction::PtrAdd: | |||
// Return the type based on the pointer argument (i.e. first operand). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update comment - first operand of Broadcast may not be a pointer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks
@@ -825,6 +828,8 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { | |||
case VPInstruction::BranchOnCond: | |||
case VPInstruction::ResumePhi: | |||
return true; | |||
case VPInstruction::PtrAdd: | |||
return Op == getOperand(0) || vputils::onlyFirstLaneUsed(this); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why restrict to first operand?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The pointer operand will always use the first lane, the index may use all lanes if other lanes are used.
@@ -2154,3 +2154,34 @@ void VPlanTransforms::handleUncountableEarlyExit( | |||
Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); | |||
LatchExitingBranch->eraseFromParent(); | |||
} | |||
|
|||
void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
void VPlanTransforms::materializeBroadcasts(VPlan &Plan) { | |
void VPlanTransforms::materializeLiveInBroadcasts(VPlan &Plan) { |
@@ -3716,6 +3759,9 @@ class VPlan { | |||
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. | |||
VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); } | |||
|
|||
/// Return the list of live-in VPValues available in the VPlan. | |||
ArrayRef<VPValue *> getLiveIns() const { return VPLiveIns; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, I still feel this could lead to confusion/problems though. If I understand correctly, VPLiveIns
is essentially a superset of Value2VPValue
? For example, any VPValue contained in the map Value2VPValue
should also exist in VPLiveIns
? Unless live-ins get deleted from VPLiveIns
or re-added?
Essentially it sounds like getLiveIn
should really be called something like findLiveInForValue
or something like that. And we might want to assert somewhere that the values found in the Value2VPValue
map also exist in VPLiveIns
. I'm just a bit worried that we have two ways of recording the same state and there might not be anything to defend them being kept in sync?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I added an assert to getLiveIns()
to make sure all VPValues also in Value2VPValue
are included in VPLiveIns
. At the moment that's guaranteed, as getOrAddLiveIn
is the only place that adds to Value2VPValue
and VPLiveIns
. Live-ins are only destroyed when the VPlan is destroyed, so at the moment nothing should remove from either VPLiveIns
or Value2VPValue
.
I left the name of getLiveIn
unchanged for now here as that should probably done separately, although the ForValue
part is implied by the argument type. WDYT?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@david-arm WDYT about the latest naming? I'd be planning to land this soon and potentially iterate on the way live-ins are stored separately if needed.
344b4af
to
386dc5b
Compare
Extra test for llvm#124644.
if (Plan.hasScalarVFOnly()) | ||
return; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: early exit earlier.
@@ -3522,7 +3565,7 @@ class VPlan { | |||
|
|||
/// Contains all the external definitions created for this VPlan. External | |||
/// definitions are VPValues that hold a pointer to their underlying IR. | |||
SmallVector<VPValue *, 16> VPLiveInsToFree; | |||
SmallVector<VPValue *, 16> VPLiveIns; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems redundant to hold all live-ins in both VPLiveIns SmallVector and above Value2VPValue map?
The latter could be renamed Value2VPLiveIn
, although VPLiveIn
's are actually VPValue
's.
@@ -3726,6 +3769,16 @@ class VPlan { | |||
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise. | |||
VPValue *getLiveIn(Value *V) const { return Value2VPValue.lookup(V); } | |||
|
|||
/// Return the list of live-in VPValues available in the VPlan. | |||
ArrayRef<VPValue *> getLiveIns() const { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The list of live-in VPValues can be produced from the ValueToVPValue map on request.
Users internal to VPlan (destructor, duplicate()) can alternatively scan the map directly.
bool onlyFirstLaneUsed(const VPValue *Op) const override { | ||
assert(is_contained(operands(), Op) && | ||
"Op must be an operand of the recipe"); | ||
return Op == getStartValue(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can leave behind a TODO.
386dc5b
to
6d0c84d
Compare
Add a new VPInstruction::Broadcast opcode and use it to materialize explicit broadcasts of live-ins. The initial patch only materlizes the broadcasts if the vector preheader dominates all uses that need it. Later patches will pick the best valid insert point, thus retiring implicit hoisting of broadcasts from VPTransformsState::get().
6d0c84d
to
789eedc
Compare
Add a new VPInstruction::Broadcast opcode and use it to materialize explicit broadcasts of live-ins. The initial patch only materlizes the broadcasts if the vector preheader dominates all uses that need it. Later patches will pick the best valid insert point, thus retiring implicit hoisting of broadcasts from VPTransformsState::get(). PR: llvm/llvm-project#124644
case VPInstruction::Broadcast: | ||
O << "broadcast"; | ||
break; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: redundant inconsistent empty line.
@@ -7697,7 +7697,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan( | |||
((VectorizingEpilogue && ExpandedSCEVs) || | |||
(!VectorizingEpilogue && !ExpandedSCEVs)) && | |||
"expanded SCEVs to reuse can only be used during epilogue vectorization"); | |||
|
|||
VPlanTransforms::materializeLiveInBroadcasts(BestVPlan); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
runPass()?
Add a new VPInstruction::Broadcast opcode and use it to materialize explicit broadcasts of live-ins. The initial patch only materlizes the broadcasts if the vector preheader dominates all uses that need it. Later patches will pick the best valid insert point, thus retiring implicit hoisting of broadcasts from VPTransformsState::get().