Skip to content

Commit

Permalink
[AArch64][SVE] Enable max vector bandwidth for SVE (llvm#109671)
Browse files Browse the repository at this point in the history
Returns true for shouldMaximizeVectorBandwidth when the register type
is a scalable vector and SVE or streaming SVE are available.
  • Loading branch information
huntergr-arm authored and EricWF committed Oct 22, 2024
1 parent e6ff862 commit 2dc4911
Show file tree
Hide file tree
Showing 8 changed files with 213 additions and 126 deletions.
6 changes: 4 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,10 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
TargetTransformInfo::RegisterKind K) const {
assert(K != TargetTransformInfo::RGK_Scalar);
return (K == TargetTransformInfo::RGK_FixedWidthVector &&
ST->isNeonAvailable());
return ((K == TargetTransformInfo::RGK_FixedWidthVector &&
ST->isNeonAvailable()) ||
(K == TargetTransformInfo::RGK_ScalableVector &&
ST->isSVEorStreamingSVEAvailable()));
}

/// Calculate the cost of materializing a 64-bit value. This helper
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -732,30 +732,60 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-LABEL: define void @multiple_exit_conditions(
; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
; DEFAULT-NEXT: entry:
; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP8]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; DEFAULT: vector.ph:
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC]], 8
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]]
; DEFAULT-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32
; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]]
; DEFAULT: vector.body:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]]
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP1]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP9:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
; DEFAULT-NEXT: [[TMP10:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
; DEFAULT-NEXT: [[TMP11:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
; DEFAULT-NEXT: [[TMP12:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
; DEFAULT-NEXT: [[TMP13:%.*]] = uitofp <vscale x 8 x i16> [[TMP9]] to <vscale x 8 x double>
; DEFAULT-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP10]] to <vscale x 8 x double>
; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp <vscale x 8 x i16> [[TMP11]] to <vscale x 8 x double>
; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp <vscale x 8 x i16> [[TMP12]] to <vscale x 8 x double>
; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; DEFAULT-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; DEFAULT-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP19]]
; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP22]]
; DEFAULT-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24
; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP25]]
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP13]], ptr [[TMP4]], align 8
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP14]], ptr [[TMP20]], align 8
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP15]], ptr [[TMP23]], align 8
; DEFAULT-NEXT: store <vscale x 8 x double> [[TMP16]], ptr [[TMP26]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
; DEFAULT-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; DEFAULT: middle.block:
; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]]
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; DEFAULT: scalar.ph:
; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ]
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; DEFAULT-NEXT: br label [[LOOP:%.*]]
; DEFAULT: vector.scevcheck:
; DEFAULT-NEXT: unreachable
Expand All @@ -780,7 +810,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PRED: vector.ph:
; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]]
; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
Expand All @@ -789,31 +819,31 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2
; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8
; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8
; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]]
; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]]
; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257)
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 257)
; PRED-NEXT: br label [[VECTOR_BODY:%.*]]
; PRED: vector.body:
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 2 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; PRED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP12]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 2 x i16> insertelement (<vscale x 2 x i16> poison, i16 1, i64 0), <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer)
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x double>
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP12]], i64 0
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
; PRED-NEXT: [[TMP13:%.*]] = or <vscale x 8 x i16> [[BROADCAST_SPLAT]], shufflevector (<vscale x 8 x i16> insertelement (<vscale x 8 x i16> poison, i16 1, i64 0), <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer)
; PRED-NEXT: [[TMP14:%.*]] = uitofp <vscale x 8 x i16> [[TMP13]] to <vscale x 8 x double>
; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0
; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0(<vscale x 2 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 2 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: call void @llvm.masked.store.nxv8f64.p0(<vscale x 8 x double> [[TMP14]], ptr [[TMP15]], i32 8, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]])
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]]
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 2 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i1> [[TMP16]], i32 0
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]])
; PRED-NEXT: [[TMP16:%.*]] = xor <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer)
; PRED-NEXT: [[TMP17:%.*]] = extractelement <vscale x 8 x i1> [[TMP16]], i32 0
; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; PRED: middle.block:
; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
; REQUIRES: asserts
; RUN: opt -mtriple=aarch64 -mattr=+sve \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16

; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \
; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4
; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16

; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2).
; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2).
Expand All @@ -29,7 +29,7 @@
; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1).

; VF-4: <4 x i32>
; VF-VSCALE4: <16 x i32>
; VF-VSCALE16: <vscale x 16 x i32>
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
entry:
br label %loop
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
; (maximized bandwidth for i8 in the loop).
define void @test0(ptr %a, ptr %b, ptr %c) #0 {
; CHECK: LV: Checking a loop in 'test0'
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4
; CHECK_SCALABLE_ON: LV: Selecting VF: 16
; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16
; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16
; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16
; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16
Expand Down
Loading

0 comments on commit 2dc4911

Please sign in to comment.