Skip to content

Commit a629322

Browse files
committed
Reland "[AArch64][SME] Add support for Copy/Spill/Fill of strided ZPR2/ZPR4 registers."
This patch contains a few changes:

* It changes the alignment of the strided/contiguous ZPR2/ZPR4 registers to 128 bits. This is important because, when we spill these registers to the stack, the address does not need to be 256/512-bit aligned: the single store/reload pseudo instruction is split into multiple STR_ZXI/LDR_ZXI (single-vector store/load) instructions, which only require 128-bit alignment. Additionally, an alignment larger than the stack alignment is not supported for scalable vectors.
* It adds support for these register classes in storeRegToStackSlot, loadRegFromStackSlot and copyPhysReg.
* It adds tests only for the strided forms. There is no need to also test the contiguous forms, because registers such as z2_z3 or z4_z5_z6_z7 are also part of the regular ZPR2 and ZPR4 register classes, respectively, which are already covered and tested.

Reviewed By: dtemirbulatov

Differential Revision: https://reviews.llvm.org/D159189
1 parent 81dc54e commit a629322

File tree

5 files changed

+169
-16
lines changed

5 files changed

+169
-16
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3669,8 +3669,10 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
36693669
}
36703670

36713671
// Copy a Z register pair by copying the individual sub-registers.
3672-
if (AArch64::ZPR2RegClass.contains(DestReg) &&
3673-
AArch64::ZPR2RegClass.contains(SrcReg)) {
3672+
if ((AArch64::ZPR2RegClass.contains(DestReg) ||
3673+
AArch64::ZPR2StridedOrContiguousRegClass.contains(DestReg)) &&
3674+
(AArch64::ZPR2RegClass.contains(SrcReg) ||
3675+
AArch64::ZPR2StridedOrContiguousRegClass.contains(SrcReg))) {
36743676
assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
36753677
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1};
36763678
copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORR_ZZZ,
@@ -3690,8 +3692,10 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
36903692
}
36913693

36923694
// Copy a Z register quad by copying the individual sub-registers.
3693-
if (AArch64::ZPR4RegClass.contains(DestReg) &&
3694-
AArch64::ZPR4RegClass.contains(SrcReg)) {
3695+
if ((AArch64::ZPR4RegClass.contains(DestReg) ||
3696+
AArch64::ZPR4StridedOrContiguousRegClass.contains(DestReg)) &&
3697+
(AArch64::ZPR4RegClass.contains(SrcReg) ||
3698+
AArch64::ZPR4StridedOrContiguousRegClass.contains(SrcReg))) {
36953699
assert(Subtarget.hasSVEorSME() && "Unexpected SVE register.");
36963700
static const unsigned Indices[] = {AArch64::zsub0, AArch64::zsub1,
36973701
AArch64::zsub2, AArch64::zsub3};
@@ -4022,7 +4026,8 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
40224026
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
40234027
Opc = AArch64::ST1Twov2d;
40244028
Offset = false;
4025-
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
4029+
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4030+
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
40264031
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
40274032
Opc = AArch64::STR_ZZXI;
40284033
StackID = TargetStackID::ScalableVector;
@@ -4044,7 +4049,8 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
40444049
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
40454050
Opc = AArch64::ST1Fourv2d;
40464051
Offset = false;
4047-
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
4052+
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4053+
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
40484054
assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
40494055
Opc = AArch64::STR_ZZZZXI;
40504056
StackID = TargetStackID::ScalableVector;
@@ -4178,7 +4184,8 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
41784184
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
41794185
Opc = AArch64::LD1Twov2d;
41804186
Offset = false;
4181-
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
4187+
} else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
4188+
AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
41824189
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
41834190
Opc = AArch64::LDR_ZZXI;
41844191
StackID = TargetStackID::ScalableVector;
@@ -4200,7 +4207,8 @@ void AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
42004207
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
42014208
Opc = AArch64::LD1Fourv2d;
42024209
Offset = false;
4203-
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
4210+
} else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
4211+
AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
42044212
assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
42054213
Opc = AArch64::LDR_ZZZZXI;
42064214
StackID = TargetStackID::ScalableVector;

llvm/lib/Target/AArch64/AArch64RegisterInfo.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1331,16 +1331,16 @@ def ZStridedQuadsHi : RegisterTuples<[zsub0, zsub1, zsub2, zsub3], [
13311331
(trunc (rotl ZPR, 24), 4), (trunc (rotl ZPR, 28), 4)
13321332
]>;
13331333

1334-
def ZPR2Strided : RegisterClass<"AArch64", [untyped], 256,
1334+
def ZPR2Strided : RegisterClass<"AArch64", [untyped], 128,
13351335
(add ZStridedPairsLo, ZStridedPairsHi)> {
13361336
let Size = 256;
13371337
}
1338-
def ZPR4Strided : RegisterClass<"AArch64", [untyped], 512,
1338+
def ZPR4Strided : RegisterClass<"AArch64", [untyped], 128,
13391339
(add ZStridedQuadsLo, ZStridedQuadsHi)> {
13401340
let Size = 512;
13411341
}
13421342

1343-
def ZPR2StridedOrContiguous : RegisterClass<"AArch64", [untyped], 256,
1343+
def ZPR2StridedOrContiguous : RegisterClass<"AArch64", [untyped], 128,
13441344
(add ZStridedPairsLo, ZStridedPairsHi,
13451345
(decimate ZSeqPairs, 2))> {
13461346
let Size = 256;
@@ -1387,7 +1387,7 @@ let EncoderMethod = "EncodeZPR2StridedRegisterClass",
13871387
: RegisterOperand<ZPR2StridedOrContiguous, "printTypedVectorList<0,'d'>">;
13881388
}
13891389

1390-
def ZPR4StridedOrContiguous : RegisterClass<"AArch64", [untyped], 512,
1390+
def ZPR4StridedOrContiguous : RegisterClass<"AArch64", [untyped], 128,
13911391
(add ZStridedQuadsLo, ZStridedQuadsHi,
13921392
(decimate ZSeqQuads, 4))> {
13931393
let Size = 512;

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2311,14 +2311,14 @@ let Predicates = [HasSVEorSME] in {
23112311
// These get expanded to individual LDR_ZXI/STR_ZXI instructions in
23122312
// AArch64ExpandPseudoInsts.
23132313
let mayLoad = 1, hasSideEffects = 0 in {
2314-
def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
2314+
def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
23152315
def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
2316-
def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
2316+
def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
23172317
}
23182318
let mayStore = 1, hasSideEffects = 0 in {
2319-
def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
2319+
def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
23202320
def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
2321-
def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
2321+
def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
23222322
}
23232323

23242324
let AddedComplexity = 1 in {

llvm/test/CodeGen/AArch64/spillfill-sve.mir

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99
define aarch64_sve_vector_pcs void @spills_fills_stack_id_ppr() #0 { entry: unreachable }
1010
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable }
1111
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable }
12+
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable }
1213
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable }
1314
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable }
15+
define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable }
1416

1517
attributes #0 = { nounwind "target-features"="+sve" }
1618

@@ -131,6 +133,51 @@ body: |
131133
RET_ReallyLR
132134
...
133135
---
136+
name: spills_fills_stack_id_zpr2strided
137+
tracksRegLiveness: true
138+
registers:
139+
- { id: 0, class: zpr2strided }
140+
stack:
141+
liveins:
142+
- { reg: '$z0_z8', virtual-reg: '%0' }
143+
body: |
144+
bb.0.entry:
145+
liveins: $z0_z1
146+
successors: %bb.1
147+
148+
$z0_z8 = COPY $z0_z1
149+
150+
B %bb.1
151+
152+
bb.1:
153+
liveins: $z0_z8
154+
155+
; CHECK-LABEL: name: spills_fills_stack_id_zpr2strided
156+
; CHECK: stack:
157+
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 32, alignment: 16
158+
; CHECK-NEXT: stack-id: scalable-vector
159+
160+
; EXPAND-LABEL: name: spills_fills_stack_id_zpr2strided
161+
; EXPAND: STR_ZXI $z0, $sp, 0
162+
; EXPAND: STR_ZXI $z8, $sp, 1
163+
; EXPAND: $z0 = LDR_ZXI $sp, 0
164+
; EXPAND: $z8 = LDR_ZXI $sp, 1
165+
166+
%0:zpr2strided = COPY $z0_z8
167+
168+
$z0_z1_z2_z3 = IMPLICIT_DEF
169+
$z4_z5_z6_z7 = IMPLICIT_DEF
170+
$z8_z9_z10_z11 = IMPLICIT_DEF
171+
$z12_z13_z14_z15 = IMPLICIT_DEF
172+
$z16_z17_z18_z19 = IMPLICIT_DEF
173+
$z20_z21_z22_z23 = IMPLICIT_DEF
174+
$z24_z25_z26_z27 = IMPLICIT_DEF
175+
$z28_z29_z30_z31 = IMPLICIT_DEF
176+
177+
$z0_z8 = COPY %0
178+
RET_ReallyLR
179+
...
180+
---
134181
name: spills_fills_stack_id_zpr3
135182
tracksRegLiveness: true
136183
registers:
@@ -210,3 +257,51 @@ body: |
210257
$z0_z1_z2_z3 = COPY %0
211258
RET_ReallyLR
212259
...
260+
---
261+
name: spills_fills_stack_id_zpr4strided
262+
tracksRegLiveness: true
263+
registers:
264+
- { id: 0, class: zpr4strided }
265+
stack:
266+
liveins:
267+
- { reg: '$z0_z4_z8_z12', virtual-reg: '%0' }
268+
body: |
269+
bb.0.entry:
270+
liveins: $z0_z1_z2_z3
271+
272+
$z0_z4_z8_z12 = COPY $z0_z1_z2_z3
273+
274+
B %bb.1
275+
276+
bb.1:
277+
liveins: $z0_z4_z8_z12
278+
279+
; CHECK-LABEL: name: spills_fills_stack_id_zpr4strided
280+
; CHECK: stack:
281+
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 64, alignment: 16
282+
; CHECK-NEXT: stack-id: scalable-vector
283+
284+
; EXPAND-LABEL: name: spills_fills_stack_id_zpr4strided
285+
; EXPAND: STR_ZXI $z0, $sp, 0
286+
; EXPAND: STR_ZXI $z4, $sp, 1
287+
; EXPAND: STR_ZXI $z8, $sp, 2
288+
; EXPAND: STR_ZXI $z12, $sp, 3
289+
; EXPAND: $z0 = LDR_ZXI $sp, 0
290+
; EXPAND: $z4 = LDR_ZXI $sp, 1
291+
; EXPAND: $z8 = LDR_ZXI $sp, 2
292+
; EXPAND: $z12 = LDR_ZXI $sp, 3
293+
294+
%0:zpr4strided = COPY $z0_z4_z8_z12
295+
296+
$z0_z1_z2_z3 = IMPLICIT_DEF
297+
$z4_z5_z6_z7 = IMPLICIT_DEF
298+
$z8_z9_z10_z11 = IMPLICIT_DEF
299+
$z12_z13_z14_z15 = IMPLICIT_DEF
300+
$z16_z17_z18_z19 = IMPLICIT_DEF
301+
$z20_z21_z22_z23 = IMPLICIT_DEF
302+
$z24_z25_z26_z27 = IMPLICIT_DEF
303+
$z28_z29_z30_z31 = IMPLICIT_DEF
304+
305+
$z0_z4_z8_z12 = COPY %0
306+
RET_ReallyLR
307+
...

llvm/test/CodeGen/AArch64/sve-copy-zprpair.mir

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,29 @@ body: |
2323
$z0_z1 = COPY $z1_z2
2424
RET_ReallyLR
2525
26+
...
27+
---
28+
name: copy_zpr2strided
29+
alignment: 4
30+
tracksRegLiveness: true
31+
liveins:
32+
- { reg: '$z0_z1' }
33+
frameInfo:
34+
maxCallFrameSize: 0
35+
body: |
36+
bb.0:
37+
liveins: $z0_z1
38+
; CHECK-LABEL: name: copy_zpr2strided
39+
; CHECK: liveins: $z0_z1
40+
; CHECK: $z8 = ORR_ZZZ $z1, $z1
41+
; CHECK: $z0 = ORR_ZZZ $z0, $z0
42+
; CHECK: $z1 = ORR_ZZZ $z8, $z8
43+
; CHECK: $z0 = ORR_ZZZ $z0, $z0
44+
; CHECK: RET_ReallyLR
45+
$z0_z8 = COPY $z0_z1
46+
$z0_z1 = COPY $z0_z8
47+
RET_ReallyLR
48+
2649
...
2750
---
2851
name: copy_zpr3
@@ -76,3 +99,30 @@ body: |
7699
RET_ReallyLR
77100
78101
...
102+
---
103+
name: copy_zpr4strided
104+
alignment: 4
105+
tracksRegLiveness: true
106+
liveins:
107+
- { reg: '$z0_z1_z2_z3' }
108+
frameInfo:
109+
maxCallFrameSize: 0
110+
body: |
111+
bb.0:
112+
liveins: $z0_z1_z2_z3
113+
; CHECK-LABEL: name: copy_zpr4
114+
; CHECK: liveins: $z0_z1_z2_z3
115+
; CHECK: $z12 = ORR_ZZZ $z3, $z3
116+
; CHECK: $z8 = ORR_ZZZ $z2, $z2
117+
; CHECK: $z4 = ORR_ZZZ $z1, $z1
118+
; CHECK: $z0 = ORR_ZZZ $z0, $z0
119+
; CHECK: $z3 = ORR_ZZZ $z12, $z12
120+
; CHECK: $z2 = ORR_ZZZ $z8, $z8
121+
; CHECK: $z1 = ORR_ZZZ $z4, $z4
122+
; CHECK: $z0 = ORR_ZZZ $z0, $z0
123+
; CHECK: RET_ReallyLR
124+
$z0_z4_z8_z12 = COPY $z0_z1_z2_z3
125+
$z0_z1_z2_z3 = COPY $z0_z4_z8_z12
126+
RET_ReallyLR
127+
128+
...

0 commit comments

Comments (0)