Skip to content

Commit cad286c

Browse files
committed
[AArch64] Set MaxInterleaving to 4 for Neoverse V2 and V3
1 parent edf56f1 commit cad286c

13 files changed

+176
-5
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

+8
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,10 @@ class TargetTransformInfo {
630630
AssumptionCache &AC, TargetLibraryInfo *LibInfo,
631631
HardwareLoopInfo &HWLoopInfo) const;
632632

633+
// Query the target for the minimum vectorization factor at which epilogue
634+
// vectorization should be considered.
635+
unsigned getEpilogueVectorizationMinVF() const;
636+
633637
/// Query the target whether it would be preferred to create a predicated
634638
/// vector loop, which can avoid the need to emit a scalar epilogue loop.
635639
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const;
@@ -1912,6 +1916,7 @@ class TargetTransformInfo::Concept {
19121916
AssumptionCache &AC,
19131917
TargetLibraryInfo *LibInfo,
19141918
HardwareLoopInfo &HWLoopInfo) = 0;
1919+
virtual unsigned getEpilogueVectorizationMinVF() = 0;
19151920
virtual bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) = 0;
19161921
virtual TailFoldingStyle
19171922
getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) = 0;
@@ -2392,6 +2397,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
23922397
HardwareLoopInfo &HWLoopInfo) override {
23932398
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
23942399
}
2400+
unsigned getEpilogueVectorizationMinVF() override {
2401+
return Impl.getEpilogueVectorizationMinVF();
2402+
}
23952403
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) override {
23962404
return Impl.preferPredicateOverEpilogue(TFI);
23972405
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

+2
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ class TargetTransformInfoImplBase {
199199
return false;
200200
}
201201

202+
unsigned getEpilogueVectorizationMinVF() const { return 16; }
203+
202204
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; }
203205

204206
TailFoldingStyle

llvm/include/llvm/CodeGen/BasicTTIImpl.h

+4
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
666666
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
667667
}
668668

669+
unsigned getEpilogueVectorizationMinVF() {
670+
return BaseT::getEpilogueVectorizationMinVF();
671+
}
672+
669673
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
670674
return BaseT::preferPredicateOverEpilogue(TFI);
671675
}

llvm/lib/Analysis/TargetTransformInfo.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,10 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
359359
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
360360
}
361361

362+
unsigned TargetTransformInfo::getEpilogueVectorizationMinVF() const {
363+
return TTIImpl->getEpilogueVectorizationMinVF();
364+
}
365+
362366
bool TargetTransformInfo::preferPredicateOverEpilogue(
363367
TailFoldingInfo *TFI) const {
364368
return TTIImpl->preferPredicateOverEpilogue(TFI);

llvm/lib/Target/AArch64/AArch64Subtarget.cpp

+3-2
Original file line numberDiff line numberDiff line change
@@ -255,12 +255,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
255255
MaxBytesForLoopAlignment = 16;
256256
break;
257257
case NeoverseV2:
258-
// Specialize cost for Neoverse-V2.
258+
case NeoverseV3:
259+
EpilogueVectorizationMinVF = 8;
260+
MaxInterleaveFactor = 4;
259261
ScatterOverhead = 13;
260262
LLVM_FALLTHROUGH;
261263
case NeoverseN2:
262264
case NeoverseN3:
263-
case NeoverseV3:
264265
PrefFunctionAlignment = Align(16);
265266
PrefLoopAlignment = Align(32);
266267
MaxBytesForLoopAlignment = 16;

llvm/lib/Target/AArch64/AArch64Subtarget.h

+4
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
5656
bool ATTRIBUTE = DEFAULT;
5757
#include "AArch64GenSubtargetInfo.inc"
5858

59+
unsigned EpilogueVectorizationMinVF = 16;
5960
uint8_t MaxInterleaveFactor = 2;
6061
uint8_t VectorInsertExtractBaseCost = 2;
6162
uint16_t CacheLineSize = 0;
@@ -237,6 +238,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
237238
hasFuseAdrpAdd() || hasFuseLiterals();
238239
}
239240

241+
unsigned getEpilogueVectorizationMinVF() const {
242+
return EpilogueVectorizationMinVF;
243+
}
240244
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
241245
unsigned getVectorInsertExtractBaseCost() const;
242246
unsigned getCacheLineSize() const override { return CacheLineSize; }

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -4736,6 +4736,10 @@ static bool containsDecreasingPointers(Loop *TheLoop,
47364736
return false;
47374737
}
47384738

4739+
unsigned AArch64TTIImpl::getEpilogueVectorizationMinVF() const {
4740+
return ST->getEpilogueVectorizationMinVF();
4741+
}
4742+
47394743
bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
47404744
if (!ST->hasSVE())
47414745
return false;

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

+2
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,8 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
391391
return ST->useFixedOverScalableIfEqualCost();
392392
}
393393

394+
unsigned getEpilogueVectorizationMinVF() const;
395+
394396
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI);
395397

396398
bool supportsScalableVectors() const {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+5-2
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ static cl::opt<unsigned> EpilogueVectorizationForceVF(
186186
"loops."));
187187

188188
static cl::opt<unsigned> EpilogueVectorizationMinVF(
189-
"epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189+
"epilogue-vectorization-minimum-VF", cl::Hidden,
190190
cl::desc("Only loops with vectorization factor equal to or larger than "
191191
"the specified value are considered for epilogue vectorization."));
192192

@@ -4701,8 +4701,11 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
47014701
// See related "TODO: extend to support scalable VFs." in
47024702
// selectEpilogueVectorizationFactor.
47034703
unsigned Multiplier = VF.isFixed() ? IC : 1;
4704+
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4705+
? EpilogueVectorizationMinVF
4706+
: TTI.getEpilogueVectorizationMinVF();
47044707
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
4705-
EpilogueVectorizationMinVF;
4708+
MinVFThreshold;
47064709
}
47074710

47084711
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(

llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll

+8
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
66
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
77
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
8+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
9+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
810

911
; Tests for selecting interleave counts for loops with loads and stores.
1012

@@ -213,6 +215,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8
213215
; INTERLEAVE-2: exit:
214216
; INTERLEAVE-2-NEXT: ret void
215217
;
218+
; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store(
219+
; INTERLEAVE-4-VLA: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
220+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
221+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
222+
; INTERLEAVE-4-VLA-NEXT: call <vscale x 16 x i8> @llvm.smax.nxv16i8(
223+
;
216224
entry:
217225
br label %loop
218226

llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll

+8
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
66
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
77
; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
8+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s
9+
; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s
810

911
; Tests for selecting the interleave count for loops with reductions.
1012

@@ -138,6 +140,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) {
138140
; INTERLEAVE-2-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
139141
; INTERLEAVE-2-NEXT: ret i32 [[RED_NEXT_LCSSA]]
140142
;
143+
; INTERLEAVE-4-VLA-LABEL: @interleave_integer_reduction(
144+
; INTERLEAVE-4-VLA: add <vscale x 4 x i32>
145+
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
146+
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
147+
; INTERLEAVE-4-VLA-NEXT: add <vscale x 4 x i32>
148+
;
141149
entry:
142150
br label %loop
143151

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
; RUN: opt -passes=loop-vectorize -S < %s | FileCheck %s
2+
3+
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
4+
target triple = "aarch64-unknown-linux-gnu"
5+
6+
define noundef i32 @V1(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #0 {
7+
; CHECK-LABEL: @V1(
8+
; CHECK-NOT: vec.epilog.ph:
9+
; CHECK-NOT: vec.epilog.vector.body:
10+
; CHECK-NOT: vec.epilog.middle.block:
11+
; CHECK-NOT: vec.epilog.scalar.ph:
12+
;
13+
entry:
14+
%4 = icmp sgt i32 %2, 0
15+
br i1 %4, label %5, label %8
16+
17+
5:
18+
%6 = zext nneg i32 %2 to i64
19+
br label %9
20+
21+
7:
22+
br label %8
23+
24+
8:
25+
ret i32 42
26+
27+
9:
28+
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
29+
%11 = getelementptr inbounds double, ptr %0, i64 %10
30+
%12 = load double, ptr %11, align 8
31+
%13 = getelementptr inbounds double, ptr %1, i64 %10
32+
%14 = load double, ptr %13, align 8
33+
%15 = fadd fast double %14, %12
34+
store double %15, ptr %11, align 8
35+
%16 = add nuw nsw i64 %10, 1
36+
%17 = icmp eq i64 %16, %6
37+
br i1 %17, label %7, label %9
38+
}
39+
40+
define noundef i32 @V2(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #1 {
41+
;
42+
; CHECK-LABEL: @V2(
43+
; CHECK: vec.epilog.ph:
44+
; CHECK: vec.epilog.vector.body:
45+
; CHECK: vec.epilog.middle.block:
46+
; CHECK: vec.epilog.scalar.ph:
47+
;
48+
entry:
49+
%4 = icmp sgt i32 %2, 0
50+
br i1 %4, label %5, label %8
51+
52+
5:
53+
%6 = zext nneg i32 %2 to i64
54+
br label %9
55+
56+
7:
57+
br label %8
58+
59+
8:
60+
ret i32 42
61+
62+
9:
63+
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
64+
%11 = getelementptr inbounds double, ptr %0, i64 %10
65+
%12 = load double, ptr %11, align 8
66+
%13 = getelementptr inbounds double, ptr %1, i64 %10
67+
%14 = load double, ptr %13, align 8
68+
%15 = fadd fast double %14, %12
69+
store double %15, ptr %11, align 8
70+
%16 = add nuw nsw i64 %10, 1
71+
%17 = icmp eq i64 %16, %6
72+
br i1 %17, label %7, label %9
73+
}
74+
75+
; TODO: The V3 will generate a scalable vector body, so it doesn't need an
76+
; epilogue loop, but it still needs to be checked whether that is really the
77+
; best thing to do for the V3.
78+
;
79+
define noundef i32 @V3(ptr noalias nocapture noundef %0, ptr noalias nocapture noundef readonly %1, i32 noundef %2) #2 {
80+
;
81+
; CHECK-LABEL: @V3(
82+
; CHECK-NOT: vec.epilog.ph:
83+
; CHECK-NOT: vec.epilog.vector.body:
84+
; CHECK-NOT: vec.epilog.middle.block:
85+
; CHECK-NOT: vec.epilog.scalar.ph:
86+
;
87+
entry:
88+
%4 = icmp sgt i32 %2, 0
89+
br i1 %4, label %5, label %8
90+
91+
5:
92+
%6 = zext nneg i32 %2 to i64
93+
br label %9
94+
95+
7:
96+
br label %8
97+
98+
8:
99+
ret i32 42
100+
101+
9:
102+
%10 = phi i64 [ 0, %5 ], [ %16, %9 ]
103+
%11 = getelementptr inbounds double, ptr %0, i64 %10
104+
%12 = load double, ptr %11, align 8
105+
%13 = getelementptr inbounds double, ptr %1, i64 %10
106+
%14 = load double, ptr %13, align 8
107+
%15 = fadd fast double %14, %12
108+
store double %15, ptr %11, align 8
109+
%16 = add nuw nsw i64 %10, 1
110+
%17 = icmp eq i64 %16, %6
111+
br i1 %17, label %7, label %9
112+
}
113+
114+
attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve2" }
115+
116+
attributes #1 = { vscale_range(1,16) "target-cpu"="neoverse-v2" "target-features"="+sve2" }
117+
118+
attributes #2 = { vscale_range(1,16) "target-cpu"="neoverse-v3" "target-features"="+sve2" }

llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-vscale-tune.ll

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
22
; RUN: -mcpu=neoverse-v1 -sve-tail-folding=disabled < %s | FileCheck %s --check-prefix=CHECK-EPILOG
33
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
4-
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
4+
; RUN: -mcpu=neoverse-v2 < %s | FileCheck %s --check-prefix=CHECK-EPILOG-V2
55
; RUN: opt -S -passes=loop-vectorize,instsimplify -force-vector-interleave=1 \
66
; RUN: -mcpu=cortex-x2 < %s | FileCheck %s --check-prefix=CHECK-NO-EPILOG
77

@@ -12,6 +12,11 @@ define void @foo(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i6
1212
; CHECK-EPILOG: vec.epilog.vector.body:
1313
; CHECK-EPILOG: load <vscale x 4 x i16>
1414

15+
; The epilogue loop gets vectorised vscale x 2 x i16 wide.
16+
; CHECK-EPILOG-V2: vec.epilog.ph:
17+
; CHECK-EPILOG-V2: vec.epilog.vector.body:
18+
; CHECK-EPILOG-V2: load <vscale x 2 x i16>
19+
1520
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.ph:
1621
; CHECK-NO-EPILOG-NOT: vec.epilog.vector.body:
1722
entry:

0 commit comments

Comments
 (0)