Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit d31cc5d

Browse files
committed
[AArch64] Emit CSR loads in the same order as stores
Optionally allow the order of restoring the callee-saved registers in the epilogue to be reversed.

The flag -reverse-csr-restore-seq generates the following code:

```
stp x26, x25, [sp, #-64]!
stp x24, x23, [sp, #16]
stp x22, x21, [sp, #32]
stp x20, x19, [sp, #48]
; [..]
ldp x24, x23, [sp, #16]
ldp x22, x21, [sp, #32]
ldp x20, x19, [sp, #48]
ldp x26, x25, [sp], #64
ret
```

Note how the CSRs are restored in the same order as they are saved. One exception to this rule is the last `ldp`, which allows us to merge the stack adjustment and the ldp into a post-index ldp. This is done by first generating:

    ldp x26, x25, [sp]
    add sp, sp, #64

which gets merged by the arm64 load store optimizer into

    ldp x26, x25, [sp], #64

The flag is disabled by default.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@327569 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent 96ae7f8 commit d31cc5d

File tree

2 files changed

+142
-14
lines changed

2 files changed

+142
-14
lines changed

lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 70 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,11 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
140140
cl::desc("enable use of redzone on AArch64"),
141141
cl::init(false), cl::Hidden);
142142

143+
static cl::opt<bool>
144+
ReverseCSRRestoreSeq("reverse-csr-restore-seq",
145+
cl::desc("reverse the CSR restore sequence"),
146+
cl::init(false), cl::Hidden);
147+
143148
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
144149

145150
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -844,14 +849,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
844849
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
845850
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
846851

852+
uint64_t AfterCSRPopSize = ArgumentPopSize;
847853
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
848854
bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
849-
850-
if (!CombineSPBump && PrologueSaveSize != 0)
851-
convertCalleeSaveRestoreToSPPrePostIncDec(
852-
MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);
855+
// Assume we can't combine the last pop with the sp restore.
856+
857+
if (!CombineSPBump && PrologueSaveSize != 0) {
858+
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
859+
// Converting the last ldp to a post-index ldp is valid only if the last
860+
// ldp's offset is 0.
861+
const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
862+
// If the offset is 0, convert it to a post-index ldp.
863+
if (OffsetOp.getImm() == 0) {
864+
convertCalleeSaveRestoreToSPPrePostIncDec(MBB, Pop, DL, TII,
865+
PrologueSaveSize);
866+
} else {
867+
// If not, make sure to emit an add after the last ldp.
868+
// We're doing this by transferring the size to be restored from the
869+
// adjustment *before* the CSR pops to the adjustment *after* the CSR
870+
// pops.
871+
AfterCSRPopSize += PrologueSaveSize;
872+
}
873+
}
853874

854875
// Move past the restores of the callee-saved registers.
876+
// If we plan on combining the sp bump of the local stack size and the callee
877+
// save stack size, we might need to adjust the CSR save and restore offsets.
855878
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
856879
MachineBasicBlock::iterator Begin = MBB.begin();
857880
while (LastPopI != Begin) {
@@ -866,7 +889,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
866889
// If there is a single SP update, insert it before the ret and we're done.
867890
if (CombineSPBump) {
868891
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
869-
NumBytes + ArgumentPopSize, TII,
892+
NumBytes + AfterCSRPopSize, TII,
870893
MachineInstr::FrameDestroy);
871894
return;
872895
}
@@ -878,18 +901,18 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
878901
bool RedZone = canUseRedZone(MF);
879902
// If this was a redzone leaf function, we don't need to restore the
880903
// stack pointer (but we may need to pop stack args for fastcc).
881-
if (RedZone && ArgumentPopSize == 0)
904+
if (RedZone && AfterCSRPopSize == 0)
882905
return;
883906

884907
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
885908
int StackRestoreBytes = RedZone ? 0 : NumBytes;
886909
if (NoCalleeSaveRestore)
887-
StackRestoreBytes += ArgumentPopSize;
910+
StackRestoreBytes += AfterCSRPopSize;
888911
emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
889912
StackRestoreBytes, TII, MachineInstr::FrameDestroy);
890913
// If we were able to combine the local stack pop with the argument pop,
891914
// then we're done.
892-
if (NoCalleeSaveRestore || ArgumentPopSize == 0)
915+
if (NoCalleeSaveRestore || AfterCSRPopSize == 0)
893916
return;
894917
NumBytes = 0;
895918
}
@@ -909,9 +932,37 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
909932
// This must be placed after the callee-save restore code because that code
910933
// assumes the SP is at the same location as it was after the callee-save save
911934
// code in the prologue.
912-
if (ArgumentPopSize)
935+
if (AfterCSRPopSize) {
936+
// Sometimes (when we restore in the same order as we save), we can end up
937+
// with code like this:
938+
//
939+
// ldp x26, x25, [sp]
940+
// ldp x24, x23, [sp, #16]
941+
// ldp x22, x21, [sp, #32]
942+
// ldp x20, x19, [sp, #48]
943+
// add sp, sp, #64
944+
//
945+
// In this case, it is always better to put the first ldp at the end, so
946+
// that the load-store optimizer can run and merge the ldp and the add into
947+
// a post-index ldp.
948+
// If we managed to grab the first pop instruction, move it to the end.
949+
if (LastPopI != Begin)
950+
MBB.splice(MBB.getFirstTerminator(), &MBB, LastPopI);
951+
// We should end up with something like this now:
952+
//
953+
// ldp x24, x23, [sp, #16]
954+
// ldp x22, x21, [sp, #32]
955+
// ldp x20, x19, [sp, #48]
956+
// ldp x26, x25, [sp]
957+
// add sp, sp, #64
958+
//
959+
// and the load-store optimizer can merge the last two instructions into:
960+
//
961+
// ldp x26, x25, [sp], #64
962+
//
913963
emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
914-
ArgumentPopSize, TII, MachineInstr::FrameDestroy);
964+
AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
965+
}
915966
}
916967

917968
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1180,9 +1231,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
11801231

11811232
computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
11821233

1183-
for (auto RPII = RegPairs.begin(), RPIE = RegPairs.end(); RPII != RPIE;
1184-
++RPII) {
1185-
RegPairInfo RPI = *RPII;
1234+
auto EmitMI = [&](const RegPairInfo &RPI) {
11861235
unsigned Reg1 = RPI.Reg1;
11871236
unsigned Reg2 = RPI.Reg2;
11881237

@@ -1221,7 +1270,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
12211270
MIB.addMemOperand(MF.getMachineMemOperand(
12221271
MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx),
12231272
MachineMemOperand::MOLoad, 8, 8));
1224-
}
1273+
};
1274+
1275+
if (ReverseCSRRestoreSeq)
1276+
for (const RegPairInfo &RPI : reverse(RegPairs))
1277+
EmitMI(RPI);
1278+
else
1279+
for (const RegPairInfo &RPI : RegPairs)
1280+
EmitMI(RPI);
12251281
return true;
12261282
}
12271283

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,BEFORELDSTOPT
2+
# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK,AFTERLDSTOPT
3+
#
4+
--- |
5+
6+
define void @foo() nounwind { entry: unreachable }
7+
8+
define void @bar() nounwind { entry: unreachable }
9+
10+
...
11+
---
12+
name: foo
13+
# CHECK-LABEL: name: foo
14+
tracksRegLiveness: true
15+
body: |
16+
bb.0:
17+
$x19 = IMPLICIT_DEF
18+
$x20 = IMPLICIT_DEF
19+
$x21 = IMPLICIT_DEF
20+
$x22 = IMPLICIT_DEF
21+
$x23 = IMPLICIT_DEF
22+
$x24 = IMPLICIT_DEF
23+
$x25 = IMPLICIT_DEF
24+
$x26 = IMPLICIT_DEF
25+
26+
; The local stack size is 0, so the last ldp in the sequence will also
27+
; restore the stack.
28+
; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 2
29+
; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
30+
; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
31+
32+
; Before running the load-store optimizer, we emit a ldp and an add.
33+
; BEFORELDSTOPT-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 0
34+
; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 64, 0
35+
36+
; We want to make sure that after running the load-store optimizer, the ldp
37+
; and the add get merged into a post-index ldp.
38+
; AFTERLDSTOPT-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
39+
40+
RET_ReallyLR
41+
...
42+
---
43+
name: bar
44+
# CHECK-LABEL: name: bar
45+
tracksRegLiveness: true
46+
stack:
47+
- { id : 0, size: 8, alignment: 4,
48+
stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
49+
local-offset: -4, di-variable: '', di-expression: '', di-location: '' }
50+
51+
body: |
52+
bb.0:
53+
$x19 = IMPLICIT_DEF
54+
$x20 = IMPLICIT_DEF
55+
$x21 = IMPLICIT_DEF
56+
$x22 = IMPLICIT_DEF
57+
$x23 = IMPLICIT_DEF
58+
$x24 = IMPLICIT_DEF
59+
$x25 = IMPLICIT_DEF
60+
$x26 = IMPLICIT_DEF
61+
62+
; The local stack size is not 0, and we can combine the CSR stack size with
63+
; the local stack size. This results in rewriting the offsets for all the
64+
; save/restores and prevents us from merging the stack adjustment and the last pop.
65+
; In this case, there is no point in moving the first CSR pair to the end.
66+
; CHECK: $x26, $x25 = frame-destroy LDPXi $sp, 2
67+
; CHECK-NEXT: $x24, $x23 = frame-destroy LDPXi $sp, 4
68+
; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
69+
; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
70+
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
71+
RET_ReallyLR
72+
...

0 commit comments

Comments
 (0)