Skip to content

Commit 9bccf61

Browse files
authored
[AArch64][LV] Set MaxInterleaving to 4 for Neoverse V2 and V3 (llvm#100385)
Set the maximum interleaving factor to 4, aligning with the number of available SIMD pipelines. This increases the number of vector instructions in the vectorised loop body, enhancing performance during its execution. However, for very low iteration counts, the vectorised body might not execute at all, leaving only the epilogue loop to run. This issue affects e.g. cam4_r from SPEC FP, which experienced a performance regression. To address this, the patch reduces the minimum epilogue vectorisation factor from 16 to 8, enabling the epilogue to be vectorised and largely mitigating the regression.
1 parent 2b5214b commit 9bccf61

13 files changed

+176
-6
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

+8
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,10 @@ class TargetTransformInfo {
630630
AssumptionCache &AC, TargetLibraryInfo *LibInfo,
631631
HardwareLoopInfo &HWLoopInfo) const;
632632

633+
// Query the target for the minimum vectorization factor at which epilogue
634+
// vectorization should be considered.
635+
unsigned getEpilogueVectorizationMinVF() const;
636+
633637
/// Query the target whether it would be preferred to create a predicated
634638
/// vector loop, which can avoid the need to emit a scalar epilogue loop.
635639
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;
@@ -1912,6 +1916,7 @@ class TargetTransformInfo::Concept {
19121916
AssumptionCache &AC,
19131917
TargetLibraryInfo *LibInfo,
19141918
HardwareLoopInfo &HWLoopInfo) = 0;
1919+
virtual unsigned getEpilogueVectorizationMinVF() = 0;
19151920
virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0;
19161921
virtual TailFoldingStyle
19171922
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
@@ -2392,6 +2397,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
23922397
HardwareLoopInfo &HWLoopInfo) override {
23932398
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
23942399
}
2400+
unsigned getEpilogueVectorizationMinVF() override {
2401+
return Impl.getEpilogueVectorizationMinVF();
2402+
}
23952403
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override {
23962404
return Impl.preferPredicateOverEpilogue(TFI);
23972405
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

+2
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ class TargetTransformInfoImplBase {
199199
return false;
200200
}
201201

202+
unsigned getEpilogueVectorizationMinVF() const { return 16; }
203+
202204
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }
203205

204206
TailFoldingStyle

llvm/include/llvm/CodeGen/BasicTTIImpl.h

+4
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
666666
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
667667
}
668668

669+
unsigned getEpilogueVectorizationMinVF() {
670+
return BaseT::getEpilogueVectorizationMinVF();
671+
}
672+
669673
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
670674
return BaseT::preferPredicateOverEpilogue(TFI);
671675
}

llvm/lib/Analysis/TargetTransformInfo.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
359359
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
360360
}
361361

362+
unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const {
363+
return TTIImpl->getEpilogueVectorizationMinVF();
364+
}
365+
362366
bool TargetTransformInfo::preferPredicateOverEpilogue(
363367
TailFoldingInfo *TFI) const {
364368
return TTIImpl->preferPredicateOverEpilogue(TFI);

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
255255
MaxBytesForLoopAlignment = 16;
256256
break;
257257
case NeoverseV2:
258-
// Specialize cost for Neoverse-V2.
258+
case NeoverseV3:
259+
EpilogueVectorizationMinVF = 8;
260+
MaxInterleaveFactor = 4;
259261
ScatterOverhead = 13;
260262
LLVM_FALLTHROUGH;
261263
case NeoverseN2:
262264
case NeoverseN3:
263-
case NeoverseV3:
264265
PrefFunctionAlignment = Align(16);
265266
PrefLoopAlignment = Align(32);
266267
MaxBytesForLoopAlignment = 16;

llvm/lib/Target/AArch64/AArch64Subtarget.h

+4
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
5656
bool ATTRIBUTE = DEFAULT;
5757
#include "AArch64GenSubtargetInfo.inc"
5858

59+
unsigned EpilogueVectorizationMinVF = 16;
5960
uint8_t MaxInterleaveFactor = 2;
6061
uint8_t VectorInsertExtractBaseCost = 2;
6162
uint16_t CacheLineSize = 0;
@@ -237,6 +238,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
237238
hasFuseAdrpAdd() || hasFuseLiterals();
238239
}
239240

241+
unsigned getEpilogueVectorizationMinVF() const {
242+
return EpilogueVectorizationMinVF;
243+
}
240244
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
241245
unsigned getVectorInsertExtractBaseCost() const;
242246
unsigned getCacheLineSize() const override { return CacheLineSize; }

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -4736,6 +4736,10 @@ static bool containsDecreasingPointers(Loop *TheLoop,
47364736
return false;
47374737
}
47384738

4739+
unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
4740+
return ST->getEpilogueVectorizationMinVF();
4741+
}
4742+
47394743
bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
47404744
if (!ST->hasSVE())
47414745
return false;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

+2
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
391391
return ST->useFixedOverScalableIfEqualCost();
392392
}
393393

394+
unsigned getEpilogueVectorizationMinVF() const;
395+
394396
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
395397

396398
bool supportsScalableVectors() const {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ static cl::opt<unsigned> EpilogueVectorizationForceVF(
186186
"loops."));
187187

188188
static cl::opt<unsigned> EpilogueVectorizationMinVF(
189-
"epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189+
"epilogue-vectorization-minimum-VF", cl::Hidden,
190190
cl::desc("Only loops with vectorization factor equal to or larger than "
191191
"the specified value are considered for epilogue vectorization."));
192192

@@ -4701,8 +4701,10 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
47014701
// See related "TODO: extend to support scalable VFs." in
47024702
// selectEpilogueVectorizationFactor.
47034703
unsigned Multiplier = VF.isFixed() ? IC : 1;
4704-
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
4705-
EpilogueVectorizationMinVF;
4704+
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4705+
? EpilogueVectorizationMinVF
4706+
: TTI.getEpilogueVectorizationMinVF();
4707+
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
47064708
}
47074709

47084710
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(

llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll

+8
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
66
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
77
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
8+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
9+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
810

911
; Tests for selecting interleave counts for loops with loads and stores.
1012

@@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
213215
; INTERLEAVE-2: exit:
214216
; INTERLEAVE-2-NEXT: ret void
215217
;
218+
; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
219+
; INTERLEAVE-4-VLA: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
220+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
221+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
222+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
223+
;
216224
entry:
217225
br label %loop
218226

llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll

+8
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
66
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
77
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
8+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
9+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
810

911
; Tests for selecting the interleave count for loops with reductions.
1012

@@ -138,6 +140,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
138140
; INTERLEAVE-2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
139141
; INTERLEAVE-2-NEXT: ret i32 [[RED_NEXT_LCSSA]]
140142
;
143+
; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
144+
; INTERLEAVE-4-VLA: add <vscale x 4 x i32>
145+
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
146+
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
147+
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
148+
;
141149
entry:
142150
br label %loop
143151

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
2+
3+
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
define noundef i32 @V1(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #0 {
7+
; CHECK-LABEL: @V1(
8+
; CHECK-NOT: vec.epilog.ph:
9+
; CHECK-NOT: vec.epilog.vector.body:
10+
; CHECK-NOT: vec.epilog.middle.block:
11+
; CHECK-NOT: vec.epilog.scalar.ph:
12+
;
13+
entry:
14+
%4 = icmp sgt i32 %2, 0
15+
br i1 %4, label %5, label %8
16+
17+
5:
18+
%6 = zext nneg i32 %2 to i64
19+
br label %9
20+
21+
7:
22+
br label %8
23+
24+
8:
25+
ret i32 42
26+
27+
9:
28+
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
29+
%11 = getelementptr inbounds double, ptr %0, i64 %10
30+
%12 = load double, ptr %11, align 8
31+
%13 = getelementptr inbounds double, ptr %1, i64 %10
32+
%14 = load double, ptr %13, align 8
33+
%15 = fadd fast double %14, %12
34+
store double %15, ptr %11, align 8
35+
%16 = add nuw nsw i64 %10, 1
36+
%17 = icmp eq i64 %16, %6
37+
br i1 %17, label %7, label %9
38+
}
39+
40+
define noundef i32 @V2(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #1 {
41+
;
42+
; CHECK-LABEL: @V2(
43+
; CHECK: vec.epilog.ph:
44+
; CHECK: vec.epilog.vector.body:
45+
; CHECK: vec.epilog.middle.block:
46+
; CHECK: vec.epilog.scalar.ph:
47+
;
48+
entry:
49+
%4 = icmp sgt i32 %2, 0
50+
br i1 %4, label %5, label %8
51+
52+
5:
53+
%6 = zext nneg i32 %2 to i64
54+
br label %9
55+
56+
7:
57+
br label %8
58+
59+
8:
60+
ret i32 42
61+
62+
9:
63+
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
64+
%11 = getelementptr inbounds double, ptr %0, i64 %10
65+
%12 = load double, ptr %11, align 8
66+
%13 = getelementptr inbounds double, ptr %1, i64 %10
67+
%14 = load double, ptr %13, align 8
68+
%15 = fadd fast double %14, %12
69+
store double %15, ptr %11, align 8
70+
%16 = add nuw nsw i64 %10, 1
71+
%17 = icmp eq i64 %16, %6
72+
br i1 %17, label %7, label %9
73+
}
74+
75+
; TODO: The V3 will generate a scalable vector body, so it doesn't need an
76+
; epilogue loop, but it needs to be checked whether that is really the best
77+
; thing to do for the V3.
78+
;
79+
define noundef i32 @V3(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #2 {
80+
;
81+
; CHECK-LABEL: @V3(
82+
; CHECK-NOT: vec.epilog.ph:
83+
; CHECK-NOT: vec.epilog.vector.body:
84+
; CHECK-NOT: vec.epilog.middle.block:
85+
; CHECK-NOT: vec.epilog.scalar.ph:
86+
;
87+
entry:
88+
%4 = icmp sgt i32 %2, 0
89+
br i1 %4, label %5, label %8
90+
91+
5:
92+
%6 = zext nneg i32 %2 to i64
93+
br label %9
94+
95+
7:
96+
br label %8
97+
98+
8:
99+
ret i32 42
100+
101+
9:
102+
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
103+
%11 = getelementptr inbounds double, ptr %0, i64 %10
104+
%12 = load double, ptr %11, align 8
105+
%13 = getelementptr inbounds double, ptr %1, i64 %10
106+
%14 = load double, ptr %13, align 8
107+
%15 = fadd fast double %14, %12
108+
store double %15, ptr %11, align 8
109+
%16 = add nuw nsw i64 %10, 1
110+
%17 = icmp eq i64 %16, %6
111+
br i1 %17, label %7, label %9
112+
}
113+
114+
attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2" }
115+
116+
attributes #1 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve2" }
117+
118+
attributes #2 = { vscale_range(1,16) "target-cpu"="neoverse-v3" "target-features"="+sve2" }

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
22
; RUN: -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
33
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
4-
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
4+
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-EPILOG-V2
55
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
66
; RUN: -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
77

@@ -12,6 +12,11 @@ define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i6
1212
; CHECK-EPILOG: vec.epilog.vector.body:
1313
; CHECK-EPILOG: load <vscale x 4 x i16>
1414

15+
; The epilogue loop gets vectorised vscale x 2 x i16 wide.
16+
; CHECK-EPILOG-V2: vec.epilog.ph:
17+
; CHECK-EPILOG-V2: vec.epilog.vector.body:
18+
; CHECK-EPILOG-V2: load <vscale x 2 x i16>
19+
1520
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.ph:
1621
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.body:
1722
entry:

0 commit comments

Comments
 (0)