Skip to content

Commit 8488520

Browse files
committed
Introduce ExplicitVectorLengthMask recipe for out-loop reduction.
1 parent 28dd35c commit 8488520

11 files changed

+497
-581
lines changed

llvm/lib/Transforms/Vectorize/VPlan.h

+1
Original file line numberDiff line numberDiff line change
@@ -1183,6 +1183,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
11831183
SLPStore,
11841184
ActiveLaneMask,
11851185
ExplicitVectorLength,
1186+
ExplicitVectorLengthMask,
11861187
CalculateTripCountMinusVF,
11871188
// Increment the canonical IV separately for each unrolled part.
11881189
CanonicalIVIncrementForPart,

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

+13
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
139139
case VPInstruction::Not:
140140
case VPInstruction::CalculateTripCountMinusVF:
141141
case VPInstruction::CanonicalIVIncrementForPart:
142+
case VPInstruction::ExplicitVectorLengthMask:
142143
case VPInstruction::ExtractFromEnd:
143144
case VPInstruction::FirstOrderRecurrenceSplice:
144145
case VPInstruction::LogicalAnd:
@@ -429,6 +430,14 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
429430
Value *EVL = GetEVL(State, AVL);
430431
return EVL;
431432
}
433+
case VPInstruction::ExplicitVectorLengthMask: {
434+
assert(Part == 0 && "No unrolling expected for predicated vectorization.");
435+
// Compute step < splat(evl)
436+
Value *EVL = State.get(getOperand(0), VPIteration(0, 0));
437+
Value *SplatEVL = Builder.CreateVectorSplat(State.VF, EVL);
438+
Value *Step = Builder.CreateStepVector(SplatEVL->getType());
439+
return Builder.CreateICmpULT(Step, SplatEVL, "evl.mask");
440+
}
432441
case VPInstruction::CanonicalIVIncrementForPart: {
433442
auto *IV = State.get(getOperand(0), VPIteration(0, 0));
434443
if (Part == 0)
@@ -675,6 +684,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
675684
return vputils::onlyFirstLaneUsed(this);
676685
case VPInstruction::ActiveLaneMask:
677686
case VPInstruction::ExplicitVectorLength:
687+
case VPInstruction::ExplicitVectorLengthMask:
678688
case VPInstruction::CalculateTripCountMinusVF:
679689
case VPInstruction::CanonicalIVIncrementForPart:
680690
case VPInstruction::BranchOnCount:
@@ -733,6 +743,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
733743
case VPInstruction::ExplicitVectorLength:
734744
O << "EXPLICIT-VECTOR-LENGTH";
735745
break;
746+
case VPInstruction::ExplicitVectorLengthMask:
747+
O << "EXPLICIT-VECTOR-LENGTH-MASK";
748+
break;
736749
case VPInstruction::FirstOrderRecurrenceSplice:
737750
O << "first-order splice";
738751
break;

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

+50-34
Original file line numberDiff line numberDiff line change
@@ -1426,6 +1426,23 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
14261426
{EVLPhi, Plan.getTripCount()});
14271427
VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
14281428

1429+
// Replace header mask pattern (ICmp::ule widen-canonical-IV, BTC) with a
1430+
// (ICmp::ult step-vector, EVL).
1431+
// TODO: Replace all users of the ExplicitVectorLengthMask recipe with
1432+
// EVL-series recipes wherever possible to ensure the final VPlan does not use
1433+
// the mask. The ExplicitVectorLengthMask recipe is a temporary approach to
1434+
// handle situations requiring a header mask, such as out-loop (unordered)
1435+
// reductions. It is necessary to generate a mask different from the original
1436+
// header mask because the explicit vector length of the second-to-last
1437+
// iteration may be smaller than VF*UF.
1438+
auto *EVLMask =
1439+
new VPInstruction(VPInstruction::ExplicitVectorLengthMask, {VPEVL});
1440+
EVLMask->insertAfter(VPEVL);
1441+
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
1442+
HeaderMask->replaceAllUsesWith(EVLMask);
1443+
recursivelyDeleteDeadRecipes(HeaderMask);
1444+
}
1445+
14291446
auto *CanonicalIVIncrement =
14301447
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
14311448
VPSingleDefRecipe *OpVPEVL = VPEVL;
@@ -1444,45 +1461,44 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
14441461
NextEVLIV->insertBefore(CanonicalIVIncrement);
14451462
EVLPhi->addOperand(NextEVLIV);
14461463

1447-
for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) {
1448-
for (VPUser *U : collectUsersRecursively(HeaderMask)) {
1449-
VPRecipeBase *NewRecipe = nullptr;
1450-
auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
1451-
if (!CurRecipe || CurRecipe->getNumDefinedValues() > 1)
1452-
continue;
1464+
for (VPUser *U : collectUsersRecursively(EVLMask)) {
1465+
VPRecipeBase *NewRecipe = nullptr;
1466+
auto *CurRecipe = dyn_cast<VPRecipeBase>(U);
1467+
if (!CurRecipe || CurRecipe->getNumDefinedValues() > 1)
1468+
continue;
14531469

1454-
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
1455-
assert(OrigMask && "Unmasked recipe when folding tail");
1456-
return HeaderMask == OrigMask ? nullptr : OrigMask;
1457-
};
1458-
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
1459-
VPValue *NewMask = GetNewMask(MemR->getMask());
1460-
if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
1461-
NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
1462-
else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
1463-
NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
1464-
else
1465-
llvm_unreachable("unsupported recipe");
1466-
} else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
1467-
NewRecipe = new VPReductionEVLRecipe(RedR, VPEVL,
1468-
GetNewMask(RedR->getCondOp()));
1469-
}
1470+
auto GetNewMask = [&](VPValue *OrigMask) -> VPValue * {
1471+
assert(OrigMask && "Unmasked recipe when folding tail");
1472+
return EVLMask == OrigMask ? nullptr : OrigMask;
1473+
};
1474+
if (auto *MemR = dyn_cast<VPWidenMemoryRecipe>(CurRecipe)) {
1475+
VPValue *NewMask = GetNewMask(MemR->getMask());
1476+
if (auto *L = dyn_cast<VPWidenLoadRecipe>(MemR))
1477+
NewRecipe = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask);
1478+
else if (auto *S = dyn_cast<VPWidenStoreRecipe>(MemR))
1479+
NewRecipe = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask);
1480+
else
1481+
llvm_unreachable("unsupported recipe");
1482+
} else if (auto *RedR = dyn_cast<VPReductionRecipe>(CurRecipe)) {
1483+
NewRecipe =
1484+
new VPReductionEVLRecipe(RedR, VPEVL, GetNewMask(RedR->getCondOp()));
1485+
}
14701486

1471-
if (NewRecipe) {
1472-
unsigned NumDefVal = NewRecipe->getNumDefinedValues();
1473-
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
1474-
"New recipe must define the same number of values as the "
1475-
"original.");
1476-
NewRecipe->insertBefore(CurRecipe);
1477-
if (NumDefVal > 0) {
1478-
VPValue *CurVPV = CurRecipe->getVPSingleValue();
1479-
CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
1480-
}
1481-
CurRecipe->eraseFromParent();
1487+
if (NewRecipe) {
1488+
unsigned NumDefVal = NewRecipe->getNumDefinedValues();
1489+
assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
1490+
"New recipe must define the same number of values as the "
1491+
"original.");
1492+
NewRecipe->insertBefore(CurRecipe);
1493+
if (NumDefVal > 0) {
1494+
VPValue *CurVPV = CurRecipe->getVPSingleValue();
1495+
CurVPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
14821496
}
1497+
CurRecipe->eraseFromParent();
14831498
}
1484-
recursivelyDeleteDeadRecipes(HeaderMask);
14851499
}
1500+
recursivelyDeleteDeadRecipes(EVLMask);
1501+
14861502
// Replace all uses of VPCanonicalIVPHIRecipe by
14871503
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
14881504
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);

llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll

+19-24
Original file line numberDiff line numberDiff line change
@@ -130,54 +130,49 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
130130
; IF-EVL-OUTLOOP-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], [[TMP2]]
131131
; IF-EVL-OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
132132
; IF-EVL-OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
133-
; IF-EVL-OUTLOOP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
134133
; IF-EVL-OUTLOOP-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32()
135134
; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = mul i32 [[TMP3]], 4
136-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0
137-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
138135
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
139136
; IF-EVL-OUTLOOP: vector.body:
140137
; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
141138
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
142-
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
139+
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
143140
; IF-EVL-OUTLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
144141
; IF-EVL-OUTLOOP-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[TMP5]], i32 4, i1 true)
145-
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[EVL_BASED_IV]], 0
146-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EVL_BASED_IV]], i64 0
147-
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
148-
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
149-
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> zeroinitializer, [[TMP8]]
150-
; IF-EVL-OUTLOOP-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
151-
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = icmp ule <vscale x 4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]]
152-
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP7]]
153-
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0
154-
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP12]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
155-
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = sext <vscale x 4 x i16> [[VP_OP_LOAD]] to <vscale x 4 x i32>
156-
; IF-EVL-OUTLOOP-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP13]]
157-
; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]]
142+
; IF-EVL-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i64 0
143+
; IF-EVL-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
144+
; IF-EVL-OUTLOOP-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
145+
; IF-EVL-OUTLOOP-NEXT: [[EVL_MASK:%.*]] = icmp ult <vscale x 4 x i32> [[TMP7]], [[DOTSPLAT]]
146+
; IF-EVL-OUTLOOP-NEXT: [[TMP8:%.*]] = add i32 [[EVL_BASED_IV]], 0
147+
; IF-EVL-OUTLOOP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP8]]
148+
; IF-EVL-OUTLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[TMP9]], i32 0
149+
; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i16> @llvm.vp.load.nxv4i16.p0(ptr align 2 [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), i32 [[TMP6]])
150+
; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i16> [[VP_OP_LOAD]] to <vscale x 4 x i32>
151+
; IF-EVL-OUTLOOP-NEXT: [[TMP12]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP11]]
152+
; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[EVL_MASK]], <vscale x 4 x i32> [[TMP12]], <vscale x 4 x i32> [[VEC_PHI]]
158153
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i32 [[TMP6]], [[EVL_BASED_IV]]
159154
; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP4]]
160-
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
161-
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
155+
; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
156+
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
162157
; IF-EVL-OUTLOOP: middle.block:
163-
; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP15]])
158+
; IF-EVL-OUTLOOP-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
164159
; IF-EVL-OUTLOOP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
165160
; IF-EVL-OUTLOOP: scalar.ph:
166161
; IF-EVL-OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
167-
; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
162+
; IF-EVL-OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
168163
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
169164
; IF-EVL-OUTLOOP: for.body:
170165
; IF-EVL-OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
171166
; IF-EVL-OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
172167
; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
173-
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
174-
; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP18]] to i32
168+
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
169+
; IF-EVL-OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP16]] to i32
175170
; IF-EVL-OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
176171
; IF-EVL-OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
177172
; IF-EVL-OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
178173
; IF-EVL-OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
179174
; IF-EVL-OUTLOOP: for.cond.cleanup.loopexit:
180-
; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
175+
; IF-EVL-OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
181176
; IF-EVL-OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
182177
; IF-EVL-OUTLOOP: for.cond.cleanup:
183178
; IF-EVL-OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]

0 commit comments

Comments
 (0)