-
Notifications
You must be signed in to change notification settings - Fork 14k
[AArch64] Consider runtime mode when deciding to use SVE for fixed-length vectors. #96081
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64] Consider runtime mode when deciding to use SVE for fixed-length vectors. #96081
Conversation
…ngth vectors. This also fixes the case where an SVE div was incorrectly assumed to be available in non-streaming mode with SME.
@llvm/pr-subscribers-backend-aarch64 Author: Sander de Smalen (sdesmalen-arm) Changes: This also fixes the case where an SVE div was incorrectly assumed to be available in non-streaming mode with SME. Patch is 65.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96081.diff 56 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c790209cc221f..a4fa25ffdd6ff 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1418,7 +1418,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- if (Subtarget->hasSVEorSME()) {
+ if (Subtarget->isSVEorStreamingSVEAvailable()) {
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::BSWAP, VT, Custom);
@@ -1528,14 +1528,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
- // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
- for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
- MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
- MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+ // NEON doesn't support masked loads/stores, but SME and SVE do.
+ for (auto VT :
+ {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
+ MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+ MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
+
+ // NEON doesn't support masked gathers/scatters, but SVE does.
+ if (Subtarget->isSVEAvailable()) {
+ for (auto VT :
+ {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
+ MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+ MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+ }
}
// Firstly, exclude all scalable vector extending loads/truncating stores,
@@ -6986,7 +6996,7 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
// NEON-sized vectors can be emulated using SVE instructions.
if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
- return Subtarget->hasSVEorSME();
+ return Subtarget->isSVEorStreamingSVEAvailable();
// Ensure NEON MVTs only belong to a single register class.
if (VT.getFixedSizeInBits() <= 128)
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 7ef7a89b5749f..5e1a370778914 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -185,6 +185,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
(hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}
+ /// Returns true if the target has access to either the full range of SVE instructions,
+ /// or the streaming-compatible subset of SVE instructions.
+ bool isSVEorStreamingSVEAvailable() const {
+ return hasSVE() || hasSMEFA64() || (hasSME() && isStreaming());
+ }
+
unsigned getMinVectorRegisterBitWidth() const {
// Don't assume any minimum vector size when PSTATE.SM may not be 0, because
// we don't yet support streaming-compatible codegen support that we trust
@@ -374,11 +380,11 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
}
bool useSVEForFixedLengthVectors() const {
- if (!isNeonAvailable())
- return hasSVEorSME();
+ if (!isSVEorStreamingSVEAvailable())
+ return false;
// Prefer NEON unless larger SVE registers are available.
- return hasSVEorSME() && getMinSVEVectorSizeInBits() >= 256;
+ return !isNeonAvailable() || getMinSVEVectorSizeInBits() >= 256;
}
bool useSVEForFixedLengthVectors(EVT VT) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 7de9071476e7f..f94fa037a42c4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -187,6 +187,11 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt(
"with zero meaning no minimum size is assumed."),
cl::init(0), cl::Hidden);
+static cl::opt<bool> ForceStreaming(
+ "force-streaming",
+ cl::desc("Force the use of streaming code for all functions"),
+ cl::init(false), cl::Hidden);
+
static cl::opt<bool> ForceStreamingCompatible(
"force-streaming-compatible",
cl::desc("Force the use of streaming-compatible code for all functions"),
@@ -412,7 +417,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef FS = FSAttr.isValid() ? FSAttr.getValueAsString() : TargetFS;
bool HasMinSize = F.hasMinSize();
- bool IsStreaming = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
+ bool IsStreaming = ForceStreaming ||
+ F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
F.hasFnAttribute("aarch64_pstate_sm_body");
bool IsStreamingCompatible =
F.hasFnAttribute("aarch64_pstate_sm_compatible") ||
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 9c72afd84fa7c..cdf2a962f9322 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,NONSTREAMING
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,STREAMING
; WITH VSCALE RANGE
@@ -362,145 +362,261 @@ define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
; FIXED-WIDTH VECTOR TYPES
define i32 @ctz_v16i1(<16 x i1> %a) {
-; CHECK-LABEL: ctz_v16i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.b
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v16i1:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.16b, v0.16b, #7
+; NONSTREAMING-NEXT: ptrue p0.b, vl16
+; NONSTREAMING-NEXT: ptrue p1.b
+; NONSTREAMING-NEXT: cmlt v0.16b, v0.16b, #0
+; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.b
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v16i1:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $q0 killed $q0 def $z0
+; STREAMING-NEXT: ptrue p0.b, vl16
+; STREAMING-NEXT: lsl z0.b, z0.b, #7
+; STREAMING-NEXT: ptrue p1.b
+; STREAMING-NEXT: asr z0.b, z0.b, #7
+; STREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.b
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
ret i32 %res
}
define i32 @ctz_v16i1_poison(<16 x i1> %a) {
-; CHECK-LABEL: ctz_v16i1_poison:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.b
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v16i1_poison:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.16b, v0.16b, #7
+; NONSTREAMING-NEXT: ptrue p0.b, vl16
+; NONSTREAMING-NEXT: ptrue p1.b
+; NONSTREAMING-NEXT: cmlt v0.16b, v0.16b, #0
+; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.b
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v16i1_poison:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $q0 killed $q0 def $z0
+; STREAMING-NEXT: ptrue p0.b, vl16
+; STREAMING-NEXT: lsl z0.b, z0.b, #7
+; STREAMING-NEXT: ptrue p1.b
+; STREAMING-NEXT: asr z0.b, z0.b, #7
+; STREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.b
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
ret i32 %res
}
define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
-; CHECK-LABEL: add_i64_ctz_v16i1_poison:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.16b, v0.16b, #7
-; CHECK-NEXT: ptrue p0.b, vl16
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: incp x0, p0.b
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: add_i64_ctz_v16i1_poison:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.16b, v0.16b, #7
+; NONSTREAMING-NEXT: ptrue p0.b, vl16
+; NONSTREAMING-NEXT: ptrue p1.b
+; NONSTREAMING-NEXT: cmlt v0.16b, v0.16b, #0
+; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: incp x0, p0.b
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: add_i64_ctz_v16i1_poison:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $q0 killed $q0 def $z0
+; STREAMING-NEXT: ptrue p0.b, vl16
+; STREAMING-NEXT: lsl z0.b, z0.b, #7
+; STREAMING-NEXT: ptrue p1.b
+; STREAMING-NEXT: asr z0.b, z0.b, #7
+; STREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: incp x0, p0.b
+; STREAMING-NEXT: ret
%res = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %a, i1 1)
%add = add i64 %res, %b
ret i64 %add
}
define i32 @ctz_v8i1(<8 x i1> %a) {
-; CHECK-LABEL: ctz_v8i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.8b, v0.8b, #7
-; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.b
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v8i1:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.8b, v0.8b, #7
+; NONSTREAMING-NEXT: ptrue p0.b, vl8
+; NONSTREAMING-NEXT: ptrue p1.b
+; NONSTREAMING-NEXT: cmlt v0.8b, v0.8b, #0
+; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.b
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v8i1:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT: ptrue p0.b, vl8
+; STREAMING-NEXT: lsl z0.b, z0.b, #7
+; STREAMING-NEXT: ptrue p1.b
+; STREAMING-NEXT: asr z0.b, z0.b, #7
+; STREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.b
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 0)
ret i32 %res
}
define i32 @ctz_v8i1_poison(<8 x i1> %a) {
-; CHECK-LABEL: ctz_v8i1_poison:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.8b, v0.8b, #7
-; CHECK-NEXT: ptrue p0.b, vl8
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.b
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v8i1_poison:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.8b, v0.8b, #7
+; NONSTREAMING-NEXT: ptrue p0.b, vl8
+; NONSTREAMING-NEXT: ptrue p1.b
+; NONSTREAMING-NEXT: cmlt v0.8b, v0.8b, #0
+; NONSTREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.b
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v8i1_poison:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT: ptrue p0.b, vl8
+; STREAMING-NEXT: lsl z0.b, z0.b, #7
+; STREAMING-NEXT: ptrue p1.b
+; STREAMING-NEXT: asr z0.b, z0.b, #7
+; STREAMING-NEXT: cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.b
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 1)
ret i32 %res
}
define i32 @ctz_v4i1(<4 x i1> %a) {
-; CHECK-LABEL: ctz_v4i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.4h, v0.4h, #15
-; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.h
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v4i1:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.4h, v0.4h, #15
+; NONSTREAMING-NEXT: ptrue p0.h, vl4
+; NONSTREAMING-NEXT: ptrue p1.h
+; NONSTREAMING-NEXT: cmlt v0.4h, v0.4h, #0
+; NONSTREAMING-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.h
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v4i1:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT: ptrue p0.h, vl4
+; STREAMING-NEXT: lsl z0.h, z0.h, #15
+; STREAMING-NEXT: ptrue p1.h
+; STREAMING-NEXT: asr z0.h, z0.h, #15
+; STREAMING-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.h
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 0)
ret i32 %res
}
define i32 @ctz_v4i1_poison(<4 x i1> %a) {
-; CHECK-LABEL: ctz_v4i1_poison:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.4h, v0.4h, #15
-; CHECK-NEXT: ptrue p0.h, vl4
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.h
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v4i1_poison:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.4h, v0.4h, #15
+; NONSTREAMING-NEXT: ptrue p0.h, vl4
+; NONSTREAMING-NEXT: ptrue p1.h
+; NONSTREAMING-NEXT: cmlt v0.4h, v0.4h, #0
+; NONSTREAMING-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.h
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v4i1_poison:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT: ptrue p0.h, vl4
+; STREAMING-NEXT: lsl z0.h, z0.h, #15
+; STREAMING-NEXT: ptrue p1.h
+; STREAMING-NEXT: asr z0.h, z0.h, #15
+; STREAMING-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.h
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 1)
ret i32 %res
}
define i32 @ctz_v2i1(<2 x i1> %a) {
-; CHECK-LABEL: ctz_v2i1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.2s, v0.2s, #31
-; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.s
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v2i1:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.2s, v0.2s, #31
+; NONSTREAMING-NEXT: ptrue p0.s, vl2
+; NONSTREAMING-NEXT: ptrue p1.s
+; NONSTREAMING-NEXT: cmlt v0.2s, v0.2s, #0
+; NONSTREAMING-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.s
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v2i1:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT: ptrue p0.s, vl2
+; STREAMING-NEXT: lsl z0.s, z0.s, #31
+; STREAMING-NEXT: ptrue p1.s
+; STREAMING-NEXT: asr z0.s, z0.s, #31
+; STREAMING-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.s
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 0)
ret i32 %res
}
define i32 @ctz_v2i1_poison(<2 x i1> %a) {
-; CHECK-LABEL: ctz_v2i1_poison:
-; CHECK: // %bb.0:
-; CHECK-NEXT: shl v0.2s, v0.2s, #31
-; CHECK-NEXT: ptrue p0.s, vl2
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT: brkb p0.b, p1/z, p0.b
-; CHECK-NEXT: cntp x0, p0, p0.s
-; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT: ret
+; NONSTREAMING-LABEL: ctz_v2i1_poison:
+; NONSTREAMING: // %bb.0:
+; NONSTREAMING-NEXT: shl v0.2s, v0.2s, #31
+; NONSTREAMING-NEXT: ptrue p0.s, vl2
+; NONSTREAMING-NEXT: ptrue p1.s
+; NONSTREAMING-NEXT: cmlt v0.2s, v0.2s, #0
+; NONSTREAMING-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; NONSTREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT: cntp x0, p0, p0.s
+; NONSTREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT: ret
+;
+; STREAMING-LABEL: ctz_v2i1_poison:
+; STREAMING: // %bb.0:
+; STREAMING-NEXT: // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT: ptrue p0.s, vl2
+; STREAMING-NEXT: lsl z0.s, z0.s, #31
+; STREAMING-NEXT: ptrue p1.s
+; STREAMING-NEXT: asr z0.s, z0.s, #31
+; STREAMING-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; STREAMING-NEXT: brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT: cntp x0, p0, p0.s
+; STREAMING-NEXT: // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
ret i32 %res
}
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 25f3540766618..48fbd14bd8540 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
; CHECK-LABEL: extract_row_b:
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm guessing there's another PR to change the scalable vector specific instances of hasSVEorSME()
?
; FIXME: We shouldn't ever be emitting any SVE instructions when +sme is set but the function is not in streaming mode. | ||
; RUN: llc -mattr=+sme < %s | FileCheck %s --check-prefixes=NEON-NOSVE | ||
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,SVE2 | ||
; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefixes=NONEON-NOSVE |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Most of the streaming-mode-fixed-length tests don't test the -mattr=+sme -force-streaming-compatible configuration because it doesn't test anything new. Can the RUN line be removed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well that was actually the point of adding this RUN line.
As you can see in the diff, before this patch:
- when compiling with llc -force-streaming-compatible, this resulted in scalar-only code (as expected)
- but when compiling with llc -force-streaming-compatible -mattr=+sme, this resulted in an SVE div instruction (wrong)
This case would not be covered with llc -mattr=+sme -force-streaming, because that case can use all SVE instructions.
That said, I'm not sure if this RUN line is now protecting against a condition that was written wrongly in the first place, and is therefore not a case worth protecting. But I figured it wouldn't hurt to keep the RUN line.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"protecting against a condition that was written wrongly in the first place" is the view I took, hence recommending its removal. Alternatively, if you want to be paranoid, then all the streaming-mode-fixed-length tests should have the extra RUN line for the same rationale.
@@ -1,7 +1,6 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | |||
; RUN: llc -mattr=+sve2 -force-streaming-compatible < %s | FileCheck %s --check-prefix=SVE2 | |||
; FIXME: We shouldn't ever be emitting any SVE instructions when +sme is set but the function is not in streaming mode. | |||
; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefix=SVE2 | |||
; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be -force-streaming?
; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefix=SVE | ||
; FIXME: We shouldn't ever be emitting any SVE instructions when +sme is set but the function is not in streaming mode. | ||
; RUN: llc -mattr=+sme < %s | FileCheck %s --check-prefix=SME | ||
; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-SVE-NOGATHER | ||
; RUN: llc -mattr=+sme -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can this test use the same three RUN lines as the other streaming-mode-fixed-length tests?
bool IsStreaming = ForceStreaming || | ||
F.hasFnAttribute("aarch64_pstate_sm_enabled") || | ||
F.hasFnAttribute("aarch64_pstate_sm_body"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Extremely petty comment but for IsStreamingCompatible
you check the function attributes first followed by the command line option, but here the order is reversed. My OCD doesn't like this :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you happy with me fixing up the other case in this patch as well? I figured it seems more sensible to check the bool
first before calling some more compute-intensive function.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, fine by me.
/// Returns true if the target has access to either the full range of SVE instructions, | ||
/// or the streaming-compatible subset of SVE instructions. | ||
bool isSVEorStreamingSVEAvailable() const { | ||
return hasSVE() || hasSMEFA64() || (hasSME() && isStreaming()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is SMEFA64 relevant? The feature means SVE and NEON instructions are available when in streaming mode, but you still need to be in streaming mode to execute them. Does it add any new information to the question this function is answering?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point, I agree that it shouldn't have been included here. I'll remove it.
// NEON doesn't support masked gathers/scatters, but SVE does. | ||
if (Subtarget->isSVEAvailable()) { | ||
for (auto VT : | ||
{MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64, | ||
MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, | ||
MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) { | ||
setOperationAction(ISD::MGATHER, VT, Custom); | ||
setOperationAction(ISD::MSCATTER, VT, Custom); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be better to break this out rather than nesting incompatible feature checks. When navigating the code people might bypass the whole isSVEorStreamingSVEAvailable
block as not relevant and subsequently miss what they're looking for.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm happy to make that change, but it might be better to do this in a follow-up NFC patch. That way, we can group all MGATHER/MSCATTER nodes together (fixed/scalable) and move the VECREDUCE_SEQ_FADD into that block too.
What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd rather it be done within this PR rather than introduce new code only to change it straight after. If you want to use this PR to fix the same issue with VECREDUCE_SEQ_FADD then I'm fine with that as well.
That's right. I didn't want to change everything in one big patch, but I'll follow this up afterwards. |
…ngth vectors. (llvm#96081) This also fixes the case where an SVE div was incorrectly assumed to be available in non-streaming mode with SME.
This also fixes the case where an SVE div was incorrectly assumed to be available in non-streaming mode with SME.