Skip to content

[AArch64] Set a RealUse for getVectorInstrCost with scalar uses. #138811

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3895,8 +3895,9 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
- return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
- Scalar, ScalarUserAndIdx);
+ return getVectorInstrCostHelper(Opcode, Val, CostKind, Index,
+ /*HasRealUse=*/true, nullptr, Scalar,
+ ScalarUserAndIdx);
}

InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
Expand Down
27 changes: 13 additions & 14 deletions llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
Original file line number Diff line number Diff line change
@@ -1,29 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
- ; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 -slp-threshold=-20 -slp-vectorize-hor=0 < %s | FileCheck %s
+ ; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 -slp-threshold=-40 -slp-vectorize-hor=0 < %s | FileCheck %s

define i16 @foo(i16 %in1, i16 %in2) {
; CHECK-LABEL: define i16 @foo(
; CHECK-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64>
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i64> [[TMP9]], splat (i64 65535)
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP12]], splat (i64 65533)
; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64>
; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP6]], splat (i64 196605)
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
; CHECK-NEXT: [[ZEXT3_1:%.*]] = zext i1 [[TMP8]] to i16
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
; CHECK-NEXT: [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605
; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i64> [[TMP9]], splat (i64 65533)
; CHECK-NEXT: [[CMP2_1:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
; CHECK-NEXT: [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16
; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT4_1]], [[ZEXT3_1]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
; CHECK-NEXT: [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT: [[CMP2_2:%.*]] = icmp ne i64 [[TMP13]], 196605
; CHECK-NEXT: [[CMP2_2:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
; CHECK-NEXT: [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16
; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]]
; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD2]], [[ZEXT3_2]]
Expand Down
30 changes: 20 additions & 10 deletions llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-9 -pass-remarks-output=%t < %s | FileCheck %s
; RUN: cat %t | FileCheck -check-prefix=YAML %s

; These tests check that we remove from consideration pairs of seed
Expand All @@ -20,13 +18,23 @@
;

; YAML-LABEL: Function: getelementptr_4x32
; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedHorizontalReduction
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
; YAML-NEXT: - Cost: '7'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '1'

; YAML: --- !Passed
; YAML-NEXT: Pass: slp-vectorizer
; YAML-NEXT: Name: VectorizedList
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '4'
; YAML-NEXT: - Cost: '6'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

Expand All @@ -36,7 +44,7 @@
; YAML-NEXT: Function: getelementptr_4x32
; YAML-NEXT: Args:
; YAML-NEXT: - String: 'SLP vectorized with cost '
; YAML-NEXT: - Cost: '6'
; YAML-NEXT: - Cost: '8'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

Expand Down Expand Up @@ -66,23 +74,25 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
; CHECK-NEXT: [[TMP7:%.*]] = zext nneg i32 [[TMP6]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP7]]
; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]]
; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]]
; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[T6]], i64 0
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[T8]], i64 1
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[T10]], i64 2
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[T12]], i64 3
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]])
; CHECK-NEXT: [[ADD16]] = add i32 [[TMP19]], [[SUM_032]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
Expand Down Expand Up @@ -133,7 +143,7 @@ for.body:
; YAML: Function: getelementptr_2x32
; YAML: Args:
; YAML: - String: 'SLP vectorized with cost '
; YAML: - Cost: '4'
; YAML: - Cost: '6'
; YAML-NEXT: - String: ' and with tree size '
; YAML-NEXT: - TreeSize: '3'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -597,18 +597,30 @@ bb15: ; preds = %bb15, %bb14

; Some points we collected as candidates for runtime checks have been removed
; before generating runtime checks. Make sure versioning is skipped.
define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c, float %f, float %g) {
; CHECK-LABEL: @test_bounds_removed_before_runtime_checks(
; CHECK-NEXT: entry:
; CHECK-NEXT: store <2 x i32> <i32 10, i32 300>, ptr [[A:%.*]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[F:%.*]], 2.000000e+01
; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[G:%.*]], 2.000000e+01
; CHECK-NEXT: [[TMP4:%.*]] = fptosi float [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10
; CHECK-NEXT: [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]]
; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300
; CHECK-NEXT: [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]]
; CHECK-NEXT: store i32 [[TMP7]], ptr [[A:%.*]], align 8
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1
; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP12]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8
; CHECK-NEXT: br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]]
; CHECK: bb14:
; CHECK-NEXT: [[TMP15:%.*]] = sext i32 10 to i64
; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP7]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 2, [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 2
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr [[TMP20]], align 8
; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3
Expand All @@ -618,9 +630,9 @@ define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
; CHECK-NEXT: ret void
;
entry:
%tmp1 = fmul float 10.0, 20.0
%tmp1 = fmul float %f, 20.0
%tmp2 = fptosi float %tmp1 to i32
%tmp3 = fmul float 30.0, 20.0
%tmp3 = fmul float %g, 20.0
%tmp4 = fptosi float %tmp3 to i32
%tmp5 = icmp sgt i32 100, %tmp2
%tmp6 = select i1 %tmp5, i32 %tmp2, i32 10
Expand Down
Loading