[AArch64] Consider runtime mode when deciding to use SVE for fixed-length vectors. #96081

Merged
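In brief: whether fixed-length vectors may be lowered with SVE now depends on the runtime mode (streaming vs. non-streaming), not just on which features are compiled in. Below is a simplified, standalone sketch of the predicates the patch adds and reworks; the struct and its plain boolean fields are illustrative stand-ins for the real AArch64Subtarget feature queries, not code from the patch.

// Simplified model of the new decision logic (assumed field names; the real
// class queries subtarget features and the SVE vector-size options instead).
struct SubtargetModel {
  bool HasSVE = false;                 // +sve
  bool HasSME = false;                 // +sme
  bool IsStreaming = false;            // function runs with PSTATE.SM == 1
  bool NeonAvailable = true;           // NEON usable in the current runtime mode
  unsigned MinSVEVectorSizeInBits = 0; // from -aarch64-sve-vector-bits-min

  // New predicate: some form of SVE (full or streaming-compatible) is usable
  // in the current runtime mode.
  bool isSVEorStreamingSVEAvailable() const {
    return HasSVE || (HasSME && IsStreaming);
  }

  // Reworked decision: never use SVE for fixed-length vectors when no form of
  // SVE is usable; otherwise prefer NEON unless it is unavailable (e.g. in
  // streaming mode) or wide SVE registers are guaranteed.
  bool useSVEForFixedLengthVectors() const {
    if (!isSVEorStreamingSVEAvailable())
      return false;
    return !NeonAvailable || MinSVEVectorSizeInBits >= 256;
  }
};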
60 changes: 32 additions & 28 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1418,7 +1418,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}

if (Subtarget->hasSVEorSME()) {
if (Subtarget->isSVEorStreamingSVEAvailable()) {
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::BSWAP, VT, Custom);
@@ -1430,8 +1430,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::MULHS, VT, Custom);
@@ -1528,14 +1526,13 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}

// NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
// NEON doesn't support masked loads/stores, but SME and SVE do.
for (auto VT :
{MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}

// Firstly, exclude all scalable vector extending loads/truncating stores,
@@ -1576,8 +1573,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1611,8 +1606,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
if (Subtarget->isSVEAvailable())
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
@@ -1650,8 +1643,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,

for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
@@ -1675,18 +1666,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v1i64, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);

if (Subtarget->isSVEAvailable()) {
// NEON doesn't support across-vector reductions, but SVE does.
for (auto VT :
{MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
}

// Histcnt is SVE2 only
if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
Custom);

// NOTE: Currently this has to happen after computeRegisterProperties rather
// than the preferred option of combining it with the addRegisterClass call.
if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -1762,6 +1741,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
}

// Handle operations that are only available in non-streaming SVE mode.
if (Subtarget->isSVEAvailable()) {
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}

for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
MVT::v2f32, MVT::v4f32, MVT::v2f64})
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);

// Histcnt is SVE2 only
if (Subtarget->hasSVE2())
setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
Custom);
}


if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
// Only required for llvm.aarch64.mops.memset.tag
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
@@ -6986,7 +6990,7 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(

// NEON-sized vectors can be emulated using SVE instructions.
if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
return Subtarget->hasSVEorSME();
return Subtarget->isSVEorStreamingSVEAvailable();

// Ensure NEON MVTs only belong to a single register class.
if (VT.getFixedSizeInBits() <= 128)
12 changes: 9 additions & 3 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -185,6 +185,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
(hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

/// Returns true if the target has access to either the full range of SVE
/// instructions, or the streaming-compatible subset of SVE instructions.
bool isSVEorStreamingSVEAvailable() const {
return hasSVE() || (hasSME() && isStreaming());
}

unsigned getMinVectorRegisterBitWidth() const {
// Don't assume any minimum vector size when PSTATE.SM may not be 0, because
// we don't yet support streaming-compatible codegen support that we trust
@@ -374,11 +380,11 @@
}

bool useSVEForFixedLengthVectors() const {
if (!isNeonAvailable())
return hasSVEorSME();
if (!isSVEorStreamingSVEAvailable())
return false;

// Prefer NEON unless larger SVE registers are available.
return hasSVEorSME() && getMinSVEVectorSizeInBits() >= 256;
return !isNeonAvailable() || getMinSVEVectorSizeInBits() >= 256;
}

bool useSVEForFixedLengthVectors(EVT VT) const {
13 changes: 9 additions & 4 deletions llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -187,6 +187,11 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt(
"with zero meaning no minimum size is assumed."),
cl::init(0), cl::Hidden);

static cl::opt<bool> ForceStreaming(
"force-streaming",
cl::desc("Force the use of streaming code for all functions"),
cl::init(false), cl::Hidden);

static cl::opt<bool> ForceStreamingCompatible(
"force-streaming-compatible",
cl::desc("Force the use of streaming-compatible code for all functions"),
@@ -412,11 +417,11 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef FS = FSAttr.isValid() ? FSAttr.getValueAsString() : TargetFS;
bool HasMinSize = F.hasMinSize();

bool IsStreaming = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
bool IsStreaming = ForceStreaming ||
F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
F.hasFnAttribute("aarch64_pstate_sm_body");
Comment on lines +420 to 422

Collaborator:
Extremely petty comment but for IsStreamingCompatible you check the function attributes first followed by the command line option, but here the order is reversed. My OCD doesn't like this :)

Collaborator (Author):
Are you happy with me fixing up the other case in this patch as well? I figured it seems more sensible to check the bool first before calling some more compute-intensive function.

Collaborator:
Sure, fine by me.

bool IsStreamingCompatible =
F.hasFnAttribute("aarch64_pstate_sm_compatible") ||
ForceStreamingCompatible;
bool IsStreamingCompatible = ForceStreamingCompatible ||
F.hasFnAttribute("aarch64_pstate_sm_compatible");

unsigned MinSVEVectorSize = 0;
unsigned MaxSVEVectorSize = 0;
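Usage note (a hypothetical invocation, not taken from the patch or its tests): with the option registered above, an llc command along the lines of

  llc -mtriple=aarch64 -mattr=+sve,+sme -force-streaming -aarch64-sve-vector-bits-min=256 example.ll -o -

should compile every function as streaming, exercising the fixed-length lowering paths guarded by isSVEorStreamingSVEAvailable() while the non-streaming-only actions (gathers, scatters, ordered FP reductions, the SVE2 histogram) remain disabled.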