Skip to content

Commit 4310076

Browse files
authored
LV: generalize profitability criterion over TC (#93300)
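Generalize LoopVectorizationPlanner::isMoreProfitable so that fixed-width and scalable vectorization factors are handled uniformly: when a constant maximum trip count is known, it is now taken into account for scalable VFs as well (using their estimated width), which fixes the logical pitfalls that arose from special-casing fixed-width VFs.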
1 parent 4310988 commit 4310076

File tree

4 files changed

+111
-105
lines changed

4 files changed

+111
-105
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 29 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4858,28 +4858,6 @@ bool LoopVectorizationPlanner::isMoreProfitable(
48584858

48594859
unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
48604860

4861-
if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
4862-
// If the trip count is a known (possibly small) constant, the trip count
4863-
// will be rounded up to an integer number of iterations under
4864-
// FoldTailByMasking. The total cost in that case will be
4865-
// VecCost*ceil(TripCount/VF). When not folding the tail, the total
4866-
// cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4867-
// some extra overheads, but for the purpose of comparing the costs of
4868-
// different VFs we can use this to compare the total loop-body cost
4869-
// expected after vectorization.
4870-
auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4871-
InstructionCost VectorCost,
4872-
InstructionCost ScalarCost) {
4873-
return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF)
4874-
: VectorCost * (MaxTripCount / VF) +
4875-
ScalarCost * (MaxTripCount % VF);
4876-
};
4877-
auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost);
4878-
auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost);
4879-
4880-
return RTCostA < RTCostB;
4881-
}
4882-
48834861
// Improve estimate for the vector width if it is scalable.
48844862
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
48854863
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
@@ -4893,13 +4871,37 @@ bool LoopVectorizationPlanner::isMoreProfitable(
48934871
// Assume vscale may be larger than 1 (or the value being tuned for),
48944872
// so that scalable vectorization is slightly favorable over fixed-width
48954873
// vectorization.
4896-
if (A.Width.isScalable() && !B.Width.isScalable())
4897-
return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
4874+
bool PreferScalable = A.Width.isScalable() && !B.Width.isScalable();
4875+
auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4876+
const InstructionCost &RHS) {
4877+
return PreferScalable ? LHS <= RHS : LHS < RHS;
4878+
};
48984879

48994880
// To avoid the need for FP division:
4900-
// (CostA / A.Width) < (CostB / B.Width)
4901-
// <=> (CostA * B.Width) < (CostB * A.Width)
4902-
return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
4881+
// (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4882+
// <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4883+
if (!MaxTripCount)
4884+
return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4885+
4886+
auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4887+
InstructionCost VectorCost,
4888+
InstructionCost ScalarCost) {
4889+
// If the trip count is a known (possibly small) constant, the trip count
4890+
// will be rounded up to a whole number of vector iterations under
4891+
// FoldTailByMasking. The total cost in that case will be
4892+
// VecCost*ceil(TC/VF). When not folding the tail, the total
4893+
// cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4894+
// some extra overheads, but for the purpose of comparing the costs of
4895+
// different VFs we can use this to compare the total loop-body cost
4896+
// expected after vectorization.
4897+
if (CM.foldTailByMasking())
4898+
return VectorCost * divideCeil(MaxTripCount, VF);
4899+
return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4900+
};
4901+
4902+
auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4903+
auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4904+
return CmpFn(RTCostA, RTCostB);
49034905
}
49044906

49054907
static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,

llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll

Lines changed: 48 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -8,44 +8,46 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1
88
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
99
; CHECK: vector.ph:
1010
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
11-
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
11+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
1212
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
13-
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
13+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
1414
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
1515
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 8, [[TMP4]]
1616
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
1717
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
1818
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
1919
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
20-
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
21-
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8)
22-
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
23-
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 8 x i64> [[TMP7]], zeroinitializer
24-
; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 8 x i64> [[TMP8]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
25-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP9]]
20+
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
21+
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 8)
22+
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
23+
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
24+
; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 4 x i64> [[TMP8]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
25+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
2626
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
27-
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8
27+
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
2828
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP11]]
29-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
30-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
31-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
32-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
29+
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
30+
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
31+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[VAL]], i64 0
32+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
3333
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
3434
; CHECK: vector.body:
3535
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
36-
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
37-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
36+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
37+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
3838
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
3939
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
40-
; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 3, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
41-
; CHECK-NEXT: [[TMP15:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
42-
; CHECK-NEXT: [[TMP16:%.*]] = trunc <vscale x 8 x i64> [[TMP15]] to <vscale x 8 x i8>
40+
; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
41+
; CHECK-NEXT: [[TMP15:%.*]] = lshr <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
42+
; CHECK-NEXT: [[TMP16:%.*]] = trunc <vscale x 4 x i64> [[TMP15]] to <vscale x 4 x i8>
4343
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
44-
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
44+
; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
4545
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
46-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
47-
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 8)
48-
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
46+
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 8)
47+
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
48+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
49+
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
50+
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
4951
; CHECK: middle.block:
5052
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
5153
; CHECK: scalar.ph:
@@ -99,44 +101,46 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range
99101
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
100102
; CHECK: vector.ph:
101103
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
102-
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
104+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
103105
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
104-
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
106+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
105107
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
106108
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], [[TMP4]]
107109
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
108110
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
109111
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[N_VEC]]
110112
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
111-
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
112-
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
113-
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i64> @llvm.experimental.stepvector.nxv8i64()
114-
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 8 x i64> [[TMP7]], zeroinitializer
115-
; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 8 x i64> [[TMP8]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
116-
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 8 x i64> zeroinitializer, [[TMP9]]
113+
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
114+
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
115+
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
116+
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
117+
; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 4 x i64> [[TMP8]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
118+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
117119
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
118-
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8
120+
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
119121
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP11]]
120-
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[TMP12]], i64 0
121-
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[DOTSPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
122-
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i64> poison, i64 [[VAL]], i64 0
123-
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
122+
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
123+
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
124+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[VAL]], i64 0
125+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
124126
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
125127
; CHECK: vector.body:
126128
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
127-
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
128-
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 8 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
129+
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
130+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
129131
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
130132
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP13]]
131-
; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw <vscale x 8 x i64> [[VEC_IND]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 3, i64 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
132-
; CHECK-NEXT: [[TMP15:%.*]] = lshr <vscale x 8 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
133-
; CHECK-NEXT: [[TMP16:%.*]] = trunc <vscale x 8 x i64> [[TMP15]] to <vscale x 8 x i8>
133+
; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
134+
; CHECK-NEXT: [[TMP15:%.*]] = lshr <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP14]]
135+
; CHECK-NEXT: [[TMP16:%.*]] = trunc <vscale x 4 x i64> [[TMP15]] to <vscale x 4 x i8>
134136
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0
135-
; CHECK-NEXT: call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
137+
; CHECK-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP16]], ptr [[TMP17]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
136138
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
137-
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 8 x i64> [[VEC_IND]], [[DOTSPLAT]]
138-
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
139-
; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
139+
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
140+
; CHECK-NEXT: [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
141+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
142+
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
143+
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
140144
; CHECK: middle.block:
141145
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
142146
; CHECK: scalar.ph:

0 commit comments

Comments
 (0)