Commit c3a10dc
[AArch64] Disable consecutive store merging when Neon is unavailable (#111519)
Lowering fixed-size BUILD_VECTORs without Neon may introduce stack spills, leading to more stores/reloads than if the stores had not been merged. In some cases, it can also prevent the use of paired store instructions. In the future, we may want to relax this when SVE is available, but currently the SVE lowerings for BUILD_VECTOR are limited to a few specific cases.
1 parent a2bd5db commit c3a10dc
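
For readers unfamiliar with the combine in question: DAGCombiner's store merging turns runs of adjacent scalar stores into a single store of a small fixed-length vector, and the merged value shows up as a BUILD_VECTOR in SelectionDAG. The sketch below is an illustrative IR-level picture of that rewrite; the combine actually runs on the DAG, and the function and value names here are hypothetical. Without Neon, materialising the built vector can be expanded through the stack, which is the regression this patch avoids.

; Illustrative only: before merging, two adjacent scalar stores
; (e.g. of faddv results, as in the test added by this commit).
define void @stores_unmerged(ptr %p, float %a, float %b) {
  %q = getelementptr inbounds i8, ptr %p, i64 4
  store float %a, ptr %p, align 4
  store float %b, ptr %q, align 4
  ret void
}

; After merging (roughly what DAGCombiner forms, written as IR): the two
; values are packed into a fixed-length vector (a BUILD_VECTOR at the DAG
; level) and written with one 64-bit store. Without Neon, building that
; vector may go through the stack, costing more than the two scalar stores.
define void @stores_merged(ptr %p, float %a, float %b) {
  %v0 = insertelement <2 x float> poison, float %a, i64 0
  %v1 = insertelement <2 x float> %v0, float %b, i64 1
  store <2 x float> %v1, ptr %p, align 4
  ret void
}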

File tree: 3 files changed, +108 -10 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 15 additions & 0 deletions
@@ -27519,6 +27519,21 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
   return OptSize && !VT.isVector();
 }
 
+bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+                                             const MachineFunction &MF) const {
+  // Avoid merging stores into fixed-length vectors when Neon is unavailable.
+  // In future, we could allow this when SVE is available, but currently,
+  // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
+  // the general lowering may introduce stack spills/reloads).
+  if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
+    return false;
+
+  // Do not merge to float value size (128 bytes) if no implicit float attribute
+  // is set.
+  bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
+  return !NoFloat || MemVT.getSizeInBits() <= 64;
+}
+
 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
   // We want inc-of-add for scalars and sub-of-not for vectors.
   return VT.isScalarInteger();
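
The second clause of the new hook preserves the behaviour previously defined inline in the header: in functions marked noimplicitfloat, merged stores are capped at 64 bits, since a wider (128-bit) merged store would need an FP/SIMD register that such functions should not implicitly use, while a 64-bit store can go through a GPR. A hedged illustration, with hypothetical function names (constant stores are used because those are merge candidates):

; Hypothetical example: under noimplicitfloat, the hook refuses merges wider
; than 64 bits, so the adjacent i64 stores (128 bits combined) are expected to
; stay separate (and may later be paired), while the adjacent i32 stores
; (64 bits combined) may still be merged into a single 64-bit store.
define void @noimplicitfloat_i64_pair(ptr %p) noimplicitfloat {
  %q = getelementptr inbounds i8, ptr %p, i64 8
  store i64 1, ptr %p, align 8
  store i64 2, ptr %q, align 8
  ret void
}

define void @noimplicitfloat_i32_pair(ptr %p) noimplicitfloat {
  %q = getelementptr inbounds i8, ptr %p, i64 4
  store i32 1, ptr %p, align 4
  store i32 2, ptr %q, align 4
  ret void
}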

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 1 addition & 10 deletions
@@ -846,16 +846,7 @@ class AArch64TargetLowering : public TargetLowering {
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
   bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
-                        const MachineFunction &MF) const override {
-    // Do not merge to float value size (128 bytes) if no implicit
-    // float attribute is set.
-
-    bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
-
-    if (NoFloat)
-      return (MemVT.getSizeInBits() <= 64);
-    return true;
-  }
+                        const MachineFunction &MF) const override;
 
   bool isCheapToSpeculateCttz(Type *) const override {
     return true;
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sme -O3 < %s -o - | FileCheck %s --check-prefixes=CHECK
+
+; Tests consecutive stores of @llvm.aarch64.sve.faddv. Within SDAG faddv is
+; lowered as a FADDV + EXTRACT_VECTOR_ELT (of lane 0). Stores of extracts can
+; be matched by DAGCombiner::mergeConsecutiveStores(), which we want to avoid in
+; some cases as it can lead to worse codegen.
+
+; TODO: A single `stp s0, s1, [x0]` may be preferred here.
+define void @consecutive_stores_pair(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) {
+; CHECK-LABEL: consecutive_stores_pair:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: faddv s0, p0, z0.s
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  ret void
+}
+
+define void @consecutive_stores_quadruple(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) {
+; CHECK-LABEL: consecutive_stores_quadruple:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: faddv s0, p0, z0.s
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: faddv s2, p0, z2.s
+; CHECK-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-NEXT: faddv s3, p0, z3.s
+; CHECK-NEXT: mov v2.s[1], v3.s[0]
+; CHECK-NEXT: stp d0, d2, [x0]
+; CHECK-NEXT: ret
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
+  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  store float %reduce2, ptr %dest2, align 4
+  store float %reduce3, ptr %dest3, align 4
+  ret void
+}
+
+define void @consecutive_stores_pair_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: consecutive_stores_pair_streaming_function:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: faddv s0, p0, z0.s
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: stp s0, s1, [x0]
+; CHECK-NEXT: ret
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  ret void
+}
+
+define void @consecutive_stores_quadruple_streaming_function(ptr %dest0, <vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2, <vscale x 4 x float> %vec3) "aarch64_pstate_sm_enabled" {
+; CHECK-LABEL: consecutive_stores_quadruple_streaming_function:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: faddv s0, p0, z0.s
+; CHECK-NEXT: faddv s1, p0, z1.s
+; CHECK-NEXT: faddv s2, p0, z2.s
+; CHECK-NEXT: stp s0, s1, [x0]
+; CHECK-NEXT: faddv s3, p0, z3.s
+; CHECK-NEXT: stp s2, s3, [x0, #8]
+; CHECK-NEXT: ret
+  %dest1 = getelementptr inbounds i8, ptr %dest0, i64 4
+  %dest2 = getelementptr inbounds i8, ptr %dest1, i64 4
+  %dest3 = getelementptr inbounds i8, ptr %dest2, i64 4
+  %reduce0 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec0)
+  %reduce1 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec1)
+  %reduce2 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec2)
+  %reduce3 = call float @llvm.aarch64.sve.faddv.nxv4f32(<vscale x 4 x i1> splat(i1 true), <vscale x 4 x float> %vec3)
+  store float %reduce0, ptr %dest0, align 4
+  store float %reduce1, ptr %dest1, align 4
+  store float %reduce2, ptr %dest2, align 4
+  store float %reduce3, ptr %dest3, align 4
+  ret void
+}
