Skip to content

[VPlan] Model FOR extract of exit value in VPlan. #93395

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 0 additions & 38 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3536,44 +3536,6 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
Builder.CreateExtractElement(Incoming, LastIdx, "vector.recur.extract");
}

auto RecurSplice = cast<VPInstruction>(*PhiR->user_begin());
assert(PhiR->getNumUsers() == 1 &&
RecurSplice->getOpcode() ==
VPInstruction::FirstOrderRecurrenceSplice &&
"recurrence phi must have a single user: FirstOrderRecurrenceSplice");
SmallVector<VPLiveOut *> LiveOuts;
for (VPUser *U : RecurSplice->users())
if (auto *LiveOut = dyn_cast<VPLiveOut>(U))
LiveOuts.push_back(LiveOut);

if (!LiveOuts.empty()) {
// Extract the second last element in the middle block if the
// Phi is used outside the loop. We need to extract the phi itself
// and not the last element (the phi update in the current iteration). This
// will be the value when jumping to the exit block from the
// LoopMiddleBlock, when the scalar loop is not run at all.
Value *ExtractForPhiUsedOutsideLoop = nullptr;
if (VF.isVector()) {
auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
Incoming, Idx, "vector.recur.extract.for.phi");
} else {
assert(UF > 1 && "VF and UF cannot both be 1");
// When loop is unrolled without vectorizing, initialize
// ExtractForPhiUsedOutsideLoop with the value just prior to unrolled
// value of `Incoming`. This is analogous to the vectorized case above:
// extracting the second last element when VF > 1.
ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
}

for (VPLiveOut *LiveOut : LiveOuts) {
assert(!Cost->requiresScalarEpilogue(VF.isVector()));
PHINode *LCSSAPhi = LiveOut->getPhi();
LCSSAPhi->addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
State.Plan->removeLiveOut(LCSSAPhi);
}
}

// Fix the initial value of the original recurrence in the scalar loop.
Builder.SetInsertPoint(LoopScalarPreHeader, LoopScalarPreHeader->begin());
PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
Expand Down
22 changes: 19 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,10 @@ class VPLane {

static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }

static VPLane getLastLaneForVF(const ElementCount &VF) {
unsigned LaneOffset = VF.getKnownMinValue() - 1;
static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This works "horizontally" for Offsets between 1 and VF, inclusive, if VF >= 2,
and "vertically" for Offsets between 1 and UF, inclusive, if VF = 1, implying UF >= 2.
I.e., this works in general only for Offset = 1 and Offset = 2 across possible {VF, UF} combinations.
May be good to keep private and expose only public last and penultimate cases. And/or assert.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be worth supporting any lane, subject to the constraints that it is <= VF in the vector case or <= UF in the scalar case. Added an assert for now. This also avoids explicitly checking the lane and delegating to multiple helper functions.

assert(Offset > 0 && Offset <= VF.getKnownMinValue() &&
"trying to extract with invalid offset");
unsigned LaneOffset = VF.getKnownMinValue() - Offset;
Kind LaneKind;
if (VF.isScalable())
// In this case 'LaneOffset' refers to the offset from the start of the
Expand All @@ -179,6 +181,10 @@ class VPLane {
return VPLane(LaneOffset, LaneKind);
}

/// Returns a VPLane denoting the last lane for \p VF, i.e. the lane at
/// offset 1 counting from the end.
static VPLane getLastLaneForVF(const ElementCount &VF) {
return getLaneFromEnd(VF, 1);
}

/// Returns a compile-time known value for the lane index and asserts if the
/// lane can only be calculated at runtime.
unsigned getKnownLane() const {
Expand Down Expand Up @@ -1182,6 +1188,12 @@ class VPInstruction : public VPRecipeWithIRFlags {
BranchOnCount,
BranchOnCond,
ComputeReductionResult,
// Takes the VPValue to extract from as first operand and the lane or part
// to extract as second operand, counting from the end starting with 1 for
// last. The second operand must be a positive constant and <= VF when
// extracting from a vector or <= UF when extracting from an unrolled
// scalar.
ExtractFromEnd,
LogicalAnd, // Non-poison propagating logical And.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
Expand Down Expand Up @@ -1327,6 +1339,10 @@ class VPInstruction : public VPRecipeWithIRFlags {
};
llvm_unreachable("switch should return");
}

/// Returns true if this VPInstruction produces a scalar value from a vector,
/// e.g. by performing a reduction or extracting a lane.
bool isVectorToScalar() const;
};

/// VPWidenRecipe is a recipe for producing a copy of vector type its
Expand Down Expand Up @@ -3657,7 +3673,7 @@ inline bool isUniformAfterVectorization(VPValue *VPV) {
if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
return all_of(GEP->operands(), isUniformAfterVectorization);
if (auto *VPI = dyn_cast<VPInstruction>(Def))
return VPI->getOpcode() == VPInstruction::ComputeReductionResult;
return VPI->isVectorToScalar();
return false;
}
} // end namespace vputils
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
CachedTypes[OtherV] = ResTy;
return ResTy;
}
case VPInstruction::ExtractFromEnd: {
Type *BaseTy = inferScalarType(R->getOperand(0));
if (auto *VecTy = dyn_cast<VectorType>(BaseTy))
return VecTy->getElementType();
return BaseTy;
}
case VPInstruction::Not: {
Type *ResTy = inferScalarType(R->getOperand(0));
assert(IntegerType::get(Ctx, 1) == ResTy &&
Expand Down
39 changes: 35 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInstruction::Not:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExtractFromEnd:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed in this patch, given the LiveOut user, or only for the follow-up patch extracting the last element?
Worth a TODO? Temporarily to keep alive until VPUsers are introduced.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is needed for the patch, as it always adds the extract and RAUW outside the loop, relying on DCE to clean up unneeded extracts.

Copy link
Collaborator

@ayalz ayalz Jun 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, this marks ExtractFromEnd as not having side effects.

case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
return false;
Expand Down Expand Up @@ -293,13 +294,13 @@ bool VPInstruction::doesGeneratePerAllLanes() const {
bool VPInstruction::canGenerateScalarForFirstLane() const {
if (Instruction::isBinaryOp(getOpcode()))
return true;

if (isVectorToScalar())
return true;
switch (Opcode) {
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ComputeReductionResult:
case VPInstruction::PtrAdd:
case VPInstruction::ExplicitVectorLength:
return true;
Expand Down Expand Up @@ -558,6 +559,29 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {

return ReducedPartRdx;
}
case VPInstruction::ExtractFromEnd: {
if (Part != 0)
return State.get(this, 0, /*IsScalar*/ true);

auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Document somewhere that 2nd operand of ExtractFromEnd must be a constant, either 1 or 2.

This patch exercises Offset = 2 and follow-up #93396 exercises Offset = 1.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment to where the opcode is defined, thanks. I kept the code general for now, with the constraints that the lane to extract must be <= VF when vectorizing or <= UF otherwise.

unsigned Offset = CI->getZExtValue();
assert(Offset > 0 && "Offset from end must be positive");
Value *Res;
if (State.VF.isVector()) {
assert(Offset <= State.VF.getKnownMinValue() &&
"invalid offset to extract from");
// Extract lane VF - Offset from the operand.
Res = State.get(
getOperand(0),
VPIteration(State.UF - 1, VPLane::getLaneFromEnd(State.VF, Offset)));
} else {
assert(Offset <= State.UF && "invalid offset to extract from");
// When loop is unrolled without vectorizing, retrieve UF - Offset.
Res = State.get(getOperand(0), State.UF - Offset);
}
Res->setName(Name);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This Name was found useless and destined to removal some patches ago: #81411 (review)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used to retain the original name for the IR values, which still seems useful to keep the IR readable and reduce the test changes (in particular it makes it easier to trace back to where the IR instruction is coming from)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those are available via Underlying Values? This Name was originally designed to facilitate new names, and as such appear redundant. (Issue independent of this patch.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is for cases where there is no underlying value, which is the case for this opcode and (most?) custom VPInstruction opcodes.

return Res;
}
case VPInstruction::LogicalAnd: {
Value *A = State.get(getOperand(0), Part);
Value *B = State.get(getOperand(1), Part);
Expand All @@ -575,6 +599,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
}
}

// Returns true for the opcodes that collapse a vector operand into a single
// scalar result: extracting a lane from the end or computing a reduction.
bool VPInstruction::isVectorToScalar() const {
  switch (getOpcode()) {
  case VPInstruction::ExtractFromEnd:
  case VPInstruction::ComputeReductionResult:
    return true;
  default:
    return false;
  }
}

#if !defined(NDEBUG)
bool VPInstruction::isFPMathOp() const {
// Inspired by FPMathOperator::classof. Notable differences are that we don't
Expand All @@ -597,8 +626,7 @@ void VPInstruction::execute(VPTransformState &State) {
State.setDebugLocFrom(getDebugLoc());
bool GeneratesPerFirstLaneOnly =
canGenerateScalarForFirstLane() &&
(vputils::onlyFirstLaneUsed(this) ||
getOpcode() == VPInstruction::ComputeReductionResult);
(vputils::onlyFirstLaneUsed(this) || isVectorToScalar());
bool GeneratesPerAllLanes = doesGeneratePerAllLanes();
for (unsigned Part = 0; Part < State.UF; ++Part) {
if (GeneratesPerAllLanes) {
Expand Down Expand Up @@ -692,6 +720,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::BranchOnCount:
O << "branch-on-count";
break;
case VPInstruction::ExtractFromEnd:
O << "extract-from-end";
break;
case VPInstruction::ComputeReductionResult:
O << "compute-reduction-result";
break;
Expand Down
22 changes: 17 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -802,7 +802,7 @@ sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
}

bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
VPBuilder &Builder) {
VPBuilder &LoopBuilder) {
VPDominatorTree VPDT;
VPDT.recalculate(Plan);

Expand All @@ -812,6 +812,8 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
RecurrencePhis.push_back(FOR);

VPBuilder MiddleBuilder(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth renaming Builder here to, say, LoopBuilder?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor()));
for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
SmallPtrSet<VPFirstOrderRecurrencePHIRecipe *, 4> SeenPhis;
VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
Expand All @@ -831,18 +833,28 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
// fixed-order recurrence.
VPBasicBlock *InsertBlock = Previous->getParent();
if (isa<VPHeaderPHIRecipe>(Previous))
Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
else
Builder.setInsertPoint(InsertBlock, std::next(Previous->getIterator()));
LoopBuilder.setInsertPoint(InsertBlock,
std::next(Previous->getIterator()));

auto *RecurSplice = cast<VPInstruction>(
Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
{FOR, FOR->getBackedgeValue()}));
LoopBuilder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
{FOR, FOR->getBackedgeValue()}));

FOR->replaceAllUsesWith(RecurSplice);
// Set the first operand of RecurSplice to FOR again, after replacing
// all users.
RecurSplice->setOperand(0, FOR);

Type *IntTy = Plan.getCanonicalIV()->getScalarType();
auto *Result = cast<VPInstruction>(MiddleBuilder.createNaryOp(
VPInstruction::ExtractFromEnd,
{FOR->getBackedgeValue(),
Plan.getOrAddLiveIn(ConstantInt::get(IntTy, 2))},
{}, "vector.recur.extract.for.phi"));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, this is the unique Name assigned to this VPInstruction instance.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep

RecurSplice->replaceUsesWithIf(
Result, [](VPUser &U, unsigned) { return isa<VPLiveOut>(&U); });
}
return true;
}
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,9 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) {
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
Expand Down Expand Up @@ -191,8 +191,8 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) {
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
Expand Down Expand Up @@ -332,9 +332,9 @@ define i64 @test_ptr_ivs_and_widened_ivs(ptr %src, i32 %N) {
; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; DEFAULT: middle.block:
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP15]], i32 3
; DEFAULT-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP15]], i32 2
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; DEFAULT: scalar.ph:
; DEFAULT-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -870,9 +870,9 @@ define i8 @add_phifail2(ptr noalias nocapture readonly %p, ptr noalias nocapture
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <16 x i32> [[TMP6]], i32 14
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
Expand Down
Loading
Loading