Skip to content

Commit 091a235

Browse files
authored
Revert "[AArch64][SVE] Enable max vector bandwidth for SVE" (#112873)
Reverts #109671. Reverting due to some performance regressions on neoverse-v1.
1 parent 6ce4b6d commit 091a235

File tree

8 files changed

+126
-213
lines changed

8 files changed

+126
-213
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -337,10 +337,8 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
337337
bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
338338
TargetTransformInfo::RegisterKind K) const {
339339
assert(K != TargetTransformInfo::RGK_Scalar);
340-
return ((K == TargetTransformInfo::RGK_FixedWidthVector &&
341-
ST->isNeonAvailable()) ||
342-
(K == TargetTransformInfo::RGK_ScalableVector &&
343-
ST->isSVEorStreamingSVEAvailable()));
340+
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
341+
ST->isNeonAvailable());
344342
}
345343

346344
/// Calculate the cost of materializing a 64-bit value. This helper

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 25 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -732,60 +732,30 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
732732
; DEFAULT-LABEL: define void @multiple_exit_conditions(
733733
; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
734734
; DEFAULT-NEXT: entry:
735-
; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
736-
; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32
737-
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP8]]
738-
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
735+
; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
739736
; DEFAULT: vector.ph:
740-
; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
741-
; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
742-
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
743-
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
744-
; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC]], 8
745-
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
746-
; DEFAULT-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
747-
; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
748-
; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32
737+
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
749738
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
750739
; DEFAULT: vector.body:
751740
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
752741
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
753742
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
754743
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
755744
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
756-
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP1]], i64 0
757-
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
758-
; DEFAULT-NEXT: [[TMP9:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
759-
; DEFAULT-NEXT: [[TMP10:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
760-
; DEFAULT-NEXT: [[TMP11:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
761-
; DEFAULT-NEXT: [[TMP12:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
762-
; DEFAULT-NEXT: [[TMP13:%.*]] = uitofp <vscale x 8 x i16> [[TMP9]] to <vscale x 8 x double>
763-
; DEFAULT-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP10]] to <vscale x 8 x double>
764-
; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp <vscale x 8 x i16> [[TMP11]] to <vscale x 8 x double>
765-
; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp <vscale x 8 x i16> [[TMP12]] to <vscale x 8 x double>
745+
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
746+
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
747+
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
748+
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
766749
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
767-
; DEFAULT-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
768-
; DEFAULT-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
769-
; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP19]]
770-
; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
771-
; DEFAULT-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
772-
; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP22]]
773-
; DEFAULT-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
774-
; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24
775-
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP25]]
776-
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP13]], ptr [[TMP4]], align 8
777-
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP14]], ptr [[TMP20]], align 8
778-
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP15]], ptr [[TMP23]], align 8
779-
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP16]], ptr [[TMP26]], align 8
780-
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
781-
; DEFAULT-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
782-
; DEFAULT-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
750+
; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
751+
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
752+
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
753+
; DEFAULT-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
783754
; DEFAULT: middle.block:
784-
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
785-
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
755+
; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
786756
; DEFAULT: scalar.ph:
787757
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ]
788-
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
758+
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
789759
; DEFAULT-NEXT: br label [[LOOP:%.*]]
790760
; DEFAULT: vector.scevcheck:
791761
; DEFAULT-NEXT: unreachable
@@ -810,7 +780,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
810780
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
811781
; PRED: vector.ph:
812782
; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
813-
; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
783+
; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
814784
; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
815785
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
816786
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
@@ -819,31 +789,31 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
819789
; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
820790
; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
821791
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
822-
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
792+
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
823793
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
824-
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
794+
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
825795
; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]]
826796
; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]]
827797
; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
828-
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 257)
798+
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257)
829799
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
830800
; PRED: vector.body:
831801
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
832-
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
802+
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
833803
; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
834804
; PRED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
835805
; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
836806
; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2
837-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP12]], i64 0
838-
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
839-
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
840-
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP13]] to <vscale x 8 x double>
807+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP12]], i64 0
808+
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
809+
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i64 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer)
810+
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
841811
; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
842-
; PRED-NEXT: call void @llvm.masked.store.nxv8f64.p0(<vscale x 8 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
812+
; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
843813
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
844-
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]])
845-
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
846-
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 8 x i1> [[TMP16]], i32 0
814+
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
815+
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
816+
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
847817
; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
848818
; PRED: middle.block:
849819
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]

llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
; REQUIRES: asserts
22
; RUN: opt -mtriple=aarch64 -mattr=+sve \
33
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
4-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
4+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
55

66
; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
77
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
8-
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16
8+
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
99

1010
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
1111
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
12-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16
12+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
1313

1414
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
1515
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
16-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
16+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
1717

1818
; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
1919
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
20-
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16
20+
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
2121

2222
; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
2323
; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
@@ -29,7 +29,7 @@
2929
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).
3030

3131
; VF-4: <4 x i32>
32-
; VF-VSCALE16: <vscale x 16 x i32>
32+
; VF-VSCALE4: <16 x i32>
3333
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
3434
entry:
3535
br label %loop

llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
; (maximized bandwidth for i8 in the loop).
99
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
1010
; CHECK: LV: Checking a loop in 'test0'
11-
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16
12-
; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16
11+
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
12+
; CHECK_SCALABLE_ON: LV: Selecting VF: 16
1313
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
1414
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
1515
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16

0 commit comments

Comments
 (0)