@@ -140,6 +140,11 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
140
140
cl::desc (" enable use of redzone on AArch64" ),
141
141
cl::init(false ), cl::Hidden);
142
142
143
+ static cl::opt<bool >
144
+ ReverseCSRRestoreSeq (" reverse-csr-restore-seq" ,
145
+ cl::desc (" reverse the CSR restore sequence" ),
146
+ cl::init(false ), cl::Hidden);
147
+
143
148
STATISTIC (NumRedZoneFunctions, " Number of functions using red zone" );
144
149
145
150
/// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -844,14 +849,32 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
844
849
Subtarget.isCallingConvWin64 (MF.getFunction ().getCallingConv ());
845
850
unsigned FixedObject = IsWin64 ? alignTo (AFI->getVarArgsGPRSize (), 16 ) : 0 ;
846
851
852
+ uint64_t AfterCSRPopSize = ArgumentPopSize;
847
853
auto PrologueSaveSize = AFI->getCalleeSavedStackSize () + FixedObject;
848
854
bool CombineSPBump = shouldCombineCSRLocalStackBump (MF, NumBytes);
849
-
850
- if (!CombineSPBump && PrologueSaveSize != 0 )
851
- convertCalleeSaveRestoreToSPPrePostIncDec (
852
- MBB, std::prev (MBB.getFirstTerminator ()), DL, TII, PrologueSaveSize);
855
+ // Assume we can't combine the last pop with the sp restore.
856
+
857
+ if (!CombineSPBump && PrologueSaveSize != 0 ) {
858
+ MachineBasicBlock::iterator Pop = std::prev (MBB.getFirstTerminator ());
859
+ // Converting the last ldp to a post-index ldp is valid only if the last
860
+ // ldp's offset is 0.
861
+ const MachineOperand &OffsetOp = Pop->getOperand (Pop->getNumOperands () - 1 );
862
+ // If the offset is 0, convert it to a post-index ldp.
863
+ if (OffsetOp.getImm () == 0 ) {
864
+ convertCalleeSaveRestoreToSPPrePostIncDec (MBB, Pop, DL, TII,
865
+ PrologueSaveSize);
866
+ } else {
867
+ // If not, make sure to emit an add after the last ldp.
868
+ // We're doing this by transferring the size to be restored from the
869
+ // adjustment *before* the CSR pops to the adjustment *after* the CSR
870
+ // pops.
871
+ AfterCSRPopSize += PrologueSaveSize;
872
+ }
873
+ }
853
874
854
875
// Move past the restores of the callee-saved registers.
876
+ // If we plan on combining the sp bump of the local stack size and the callee
877
+ // save stack size, we might need to adjust the CSR save and restore offsets.
855
878
MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator ();
856
879
MachineBasicBlock::iterator Begin = MBB.begin ();
857
880
while (LastPopI != Begin) {
@@ -866,7 +889,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
866
889
// If there is a single SP update, insert it before the ret and we're done.
867
890
if (CombineSPBump) {
868
891
emitFrameOffset (MBB, MBB.getFirstTerminator (), DL, AArch64::SP, AArch64::SP,
869
- NumBytes + ArgumentPopSize , TII,
892
+ NumBytes + AfterCSRPopSize , TII,
870
893
MachineInstr::FrameDestroy);
871
894
return ;
872
895
}
@@ -878,18 +901,18 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
878
901
bool RedZone = canUseRedZone (MF);
879
902
// If this was a redzone leaf function, we don't need to restore the
880
903
// stack pointer (but we may need to pop stack args for fastcc).
881
- if (RedZone && ArgumentPopSize == 0 )
904
+ if (RedZone && AfterCSRPopSize == 0 )
882
905
return ;
883
906
884
907
bool NoCalleeSaveRestore = PrologueSaveSize == 0 ;
885
908
int StackRestoreBytes = RedZone ? 0 : NumBytes;
886
909
if (NoCalleeSaveRestore)
887
- StackRestoreBytes += ArgumentPopSize ;
910
+ StackRestoreBytes += AfterCSRPopSize ;
888
911
emitFrameOffset (MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
889
912
StackRestoreBytes, TII, MachineInstr::FrameDestroy);
890
913
// If we were able to combine the local stack pop with the argument pop,
891
914
// then we're done.
892
- if (NoCalleeSaveRestore || ArgumentPopSize == 0 )
915
+ if (NoCalleeSaveRestore || AfterCSRPopSize == 0 )
893
916
return ;
894
917
NumBytes = 0 ;
895
918
}
@@ -909,9 +932,37 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
909
932
// This must be placed after the callee-save restore code because that code
910
933
// assumes the SP is at the same location as it was after the callee-save save
911
934
// code in the prologue.
912
- if (ArgumentPopSize)
935
+ if (AfterCSRPopSize) {
936
+ // Sometimes (when we restore in the same order as we save), we can end up
937
+ // with code like this:
938
+ //
939
+ // ldp x26, x25, [sp]
940
+ // ldp x24, x23, [sp, #16]
941
+ // ldp x22, x21, [sp, #32]
942
+ // ldp x20, x19, [sp, #48]
943
+ // add sp, sp, #64
944
+ //
945
+ // In this case, it is always better to put the first ldp at the end, so
946
+ // that the load-store optimizer can run and merge the ldp and the add into
947
+ // a post-index ldp.
948
+ // If we managed to grab the first pop instruction, move it to the end.
949
+ if (LastPopI != Begin)
950
+ MBB.splice (MBB.getFirstTerminator (), &MBB, LastPopI);
951
+ // We should end up with something like this now:
952
+ //
953
+ // ldp x24, x23, [sp, #16]
954
+ // ldp x22, x21, [sp, #32]
955
+ // ldp x20, x19, [sp, #48]
956
+ // ldp x26, x25, [sp]
957
+ // add sp, sp, #64
958
+ //
959
+ // and the load-store optimizer can merge the last two instructions into:
960
+ //
961
+ // ldp x26, x25, [sp], #64
962
+ //
913
963
emitFrameOffset (MBB, MBB.getFirstTerminator (), DL, AArch64::SP, AArch64::SP,
914
- ArgumentPopSize, TII, MachineInstr::FrameDestroy);
964
+ AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
965
+ }
915
966
}
916
967
917
968
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
@@ -1180,9 +1231,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
1180
1231
1181
1232
computeCalleeSaveRegisterPairs (MF, CSI, TRI, RegPairs);
1182
1233
1183
- for (auto RPII = RegPairs.begin (), RPIE = RegPairs.end (); RPII != RPIE;
1184
- ++RPII) {
1185
- RegPairInfo RPI = *RPII;
1234
+ auto EmitMI = [&](const RegPairInfo &RPI) {
1186
1235
unsigned Reg1 = RPI.Reg1 ;
1187
1236
unsigned Reg2 = RPI.Reg2 ;
1188
1237
@@ -1221,7 +1270,14 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
1221
1270
MIB.addMemOperand (MF.getMachineMemOperand (
1222
1271
MachinePointerInfo::getFixedStack (MF, RPI.FrameIdx ),
1223
1272
MachineMemOperand::MOLoad, 8 , 8 ));
1224
- }
1273
+ };
1274
+
1275
+ if (ReverseCSRRestoreSeq)
1276
+ for (const RegPairInfo &RPI : reverse (RegPairs))
1277
+ EmitMI (RPI);
1278
+ else
1279
+ for (const RegPairInfo &RPI : RegPairs)
1280
+ EmitMI (RPI);
1225
1281
return true ;
1226
1282
}
1227
1283
0 commit comments