// | async context if needed           |
// | (a.k.a. "frame record")           |
// |-----------------------------------| <- fp(=x29)
+ // |   <hazard padding>                |
+ // |-----------------------------------|
// |                                   |
// | callee-saved fp/simd/SVE regs     |
// |                                   |
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....| compile time; if present)
// |-----------------------------------|
- // |                                   |
// | local variables of fixed size     |
// | including spill slots             |
+ // |   <FPR>                           |
+ // |   <hazard padding>                |
+ // |   <GPR>                           |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....|       LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
//
// FIXME: also explain the redzone concept.
//
+ // About stack hazards: Under some SME contexts, a coprocessor with its own
+ // separate cache can be used for FP operations. This can create hazards if the
+ // CPU and the SME unit try to access the same area of memory, including if the
+ // access is to an area of the stack. To try to alleviate this we attempt to
+ // introduce extra padding into the stack frame between FP and GPR accesses,
+ // controlled by the StackHazardSize option. Without changing the layout of the
+ // stack frame in the diagram above, a stack object of size StackHazardSize is
+ // added between GPR and FPR CSRs. Another is added to the stack objects
+ // section, and stack objects are sorted so that FPR > Hazard padding slot >
+ // GPRs (where possible). Unfortunately some things are not handled well (the
+ // VLA area, arguments on the stack, and objects with both GPR and FPR
+ // accesses), but if those are controlled by the user then the entire stack
+ // frame becomes GPR at the start/end with FPR in the middle, surrounded by
+ // Hazard padding.
+ //
// An example of the prologue:
//
//     .globl __foo
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+ #include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -253,6 +272,14 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
cl::desc("Emit homogeneous prologue and epilogue for the size "
"optimization (default = off)"));

+ // Stack hazard padding size. 0 = disabled.
+ static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
+ cl::init(0), cl::Hidden);
+ // Whether to insert padding into non-streaming functions (for testing).
+ static cl::opt<bool>
+ StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
+ cl::init(false), cl::Hidden);
+
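As a rough usage sketch (an assumption about how hidden cl::opt flags are normally exercised, not something the patch states itself): the padding would typically be enabled by passing -aarch64-stack-hazard-size=<bytes> directly to llc, or -mllvm -aarch64-stack-hazard-size=<bytes> through a clang invocation, with -aarch64-stack-hazard-in-non-streaming used the same way for testing.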
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

/// Returns how much of the incoming argument stack area (in bytes) we should
@@ -1461,6 +1488,10 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// update in so create a normal arithmetic instruction instead.
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
+ // If we are destroying the frame, make sure we add the increment after the
+ // last frame operation.
+ if (FrameFlag == MachineInstr::FrameDestroy)
+ ++MBBI;
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
false, false, nullptr, EmitCFI,
@@ -2901,6 +2932,7 @@ static void computeCalleeSaveRegisterPairs(
}
int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
+ Register LastReg = 0;

// When iterating backwards, the loop condition relies on unsigned wraparound.
for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -2922,8 +2954,15 @@ static void computeCalleeSaveRegisterPairs(
else
llvm_unreachable("Unsupported register class.");

+ // Add the stack hazard size as we transition from GPR->FPR CSRs.
+ if (AFI->hasStackHazardSlotIndex() &&
+ (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+ AArch64InstrInfo::isFpOrNEON(RPI.Reg1))
+ ByteOffset += StackFillDir * StackHazardSize;
+ LastReg = RPI.Reg1;
+
// Add the next reg to the pair if it is in the same register class.
- if (unsigned(i + RegInc) < Count) {
+ if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
Register NextReg = CSI[i + RegInc].getReg();
bool IsFirst = i == FirstReg;
switch (RPI.Type) {
@@ -3034,7 +3073,8 @@ static void computeCalleeSaveRegisterPairs(
Offset += 8;
RPI.Offset = Offset / Scale;

- assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+ assert((!RPI.isPaired() ||
+ (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");
@@ -3455,6 +3495,80 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return true;
}

+ // Return the FrameID for a Load/Store instruction by looking at the MMO.
+ static std::optional<int> getLdStFrameID(const MachineInstr &MI,
+ const MachineFrameInfo &MFI) {
+ if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
+ return std::nullopt;
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ auto *PSV =
+ dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
+ if (PSV)
+ return std::optional<int>(PSV->getFrameIndex());
+
+ if (MMO->getValue()) {
+ if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) {
+ for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
+ FI++)
+ if (MFI.getObjectAllocation(FI) == Al)
+ return FI;
+ }
+ }
+
+ return std::nullopt;
+ }
+
+ // Check if a Hazard slot is needed for the current function, and if so create
+ // one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
+ // which can be used to determine if any hazard padding is needed.
+ void AArch64FrameLowering::determineStackHazardSlot(
+ MachineFunction &MF, BitVector &SavedRegs) const {
+ if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
+ MF.getInfo<AArch64FunctionInfo>()->hasStackHazardSlotIndex())
+ return;
+
+ // Stack hazards are only needed in streaming functions.
+ SMEAttrs Attrs(MF.getFunction());
+ if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
+ return;
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // Add a hazard slot if there are any CSR FPR registers, or any fp-only
+ // stack objects.
+ bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
+ return AArch64::FPR64RegClass.contains(Reg) ||
+ AArch64::FPR128RegClass.contains(Reg) ||
+ AArch64::ZPRRegClass.contains(Reg) ||
+ AArch64::PPRRegClass.contains(Reg);
+ });
+ bool HasFPRStackObjects = false;
+ if (!HasFPRCSRs) {
+ std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ std::optional<int> FI = getLdStFrameID(MI, MFI);
+ if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
+ if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+ FrameObjects[*FI] |= 2;
+ else
+ FrameObjects[*FI] |= 1;
+ }
+ }
+ }
+ HasFPRStackObjects =
+ any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
+ }
+
+ if (HasFPRCSRs || HasFPRStackObjects) {
+ int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false);
+ LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
+ << StackHazardSize << "\n");
+ MF.getInfo<AArch64FunctionInfo>()->setStackHazardSlotIndex(ID);
+ }
+ }
+

void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
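A minimal standalone sketch of the access-classification idea used by determineStackHazardSlot in the hunk above (illustrative only, not part of the patch): each frame index accumulates a small bitmask in which GPR-style accesses set bit 0 and FP/NEON/SVE accesses set bit 1, so a final value of exactly 2 marks an object as FP-only, which is what makes a hazard slot worthwhile.

#include <cassert>
#include <vector>

int main() {
  // One mask per frame object: bit 0 = seen a GPR access, bit 1 = seen an FP access.
  std::vector<unsigned> FrameObjects(3, 0);
  FrameObjects[0] |= 1; // an integer spill slot
  FrameObjects[1] |= 2; // a slot only touched by FP/NEON loads and stores
  FrameObjects[2] |= 1; // a slot accessed by GPR code...
  FrameObjects[2] |= 2; // ...and by FP code, so it counts as mixed, not FP-only
  auto IsFPROnly = [](unsigned B) { return (B & 3) == 2; };
  assert(!IsFPROnly(FrameObjects[0]));
  assert(IsFPROnly(FrameObjects[1]));
  assert(!IsFPROnly(FrameObjects[2]));
  return 0;
}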
@@ -3595,6 +3709,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
CSStackSize += 8;
}

+ // Determine if a Hazard slot should be used, and increase the CSStackSize by
+ // StackHazardSize if so.
+ determineStackHazardSlot(MF, SavedRegs);
+ if (AFI->hasStackHazardSlotIndex())
+ CSStackSize += StackHazardSize;
+
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
@@ -3761,10 +3881,28 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
}

+ Register LastReg = 0;
+ int HazardSlotIndex = std::numeric_limits<int>::max();
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);

+ // Create a hazard slot as we switch between GPR and FPR CSRs.
+ if (AFI->hasStackHazardSlotIndex() &&
+ (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+ AArch64InstrInfo::isFpOrNEON(Reg)) {
+ assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
+ "Unexpected register order for hazard slot");
+ HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+ LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+ << "\n");
+ AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+ if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+ MinCSFrameIndex = HazardSlotIndex;
+ if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+ MaxCSFrameIndex = HazardSlotIndex;
+ }
+
unsigned Size = RegInfo->getSpillSize(*RC);
Align Alignment(RegInfo->getSpillAlign(*RC));
int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
@@ -3785,7 +3923,22 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
if ((unsigned)FrameIdx > MaxCSFrameIndex)
MaxCSFrameIndex = FrameIdx;
}
+ LastReg = Reg;
+ }
+
+ // Add hazard slot in the case where no FPR CSRs are present.
+ if (AFI->hasStackHazardSlotIndex() &&
+ HazardSlotIndex == std::numeric_limits<int>::max()) {
+ HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+ LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+ << "\n");
+ AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+ if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+ MinCSFrameIndex = HazardSlotIndex;
+ if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+ MaxCSFrameIndex = HazardSlotIndex;
}
+
return true;
}
@@ -3798,6 +3951,10 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
// function doesn't use a FP.
if (AFI->hasStreamingModeChanges() && !hasFP(MF))
return false;
+ // Don't allow register scavenging with hazard slots, in case it moves objects
+ // into the wrong place.
+ if (AFI->hasStackHazardSlotIndex())
+ return false;
return AFI->hasCalleeSaveStackFreeSpace();
}
@@ -4492,6 +4649,11 @@ struct FrameObject {
// This object's group (which always contains the object with
// ObjectFirst==true) should be placed first.
bool GroupFirst = false;
+
+ // Used to distinguish between FP and GPR accesses. The values are decided so
+ // that they sort FPR < Hazard < GPR and they can be or'd together.
+ unsigned Accesses = 0;
+ enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 };
};

class GroupBuilder {
@@ -4527,8 +4689,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
// at the end. This also allows us to stop walking when we hit the
// first invalid item after it's all sorted.
//
- // The "first" object goes first (closest to SP), followed by the members of
- // the "first" group.
+ // If we want to include a stack hazard region, order FPR accesses < the
+ // hazard object < GPR accesses in order to create a separation between the
+ // two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR.
+ //
+ // Otherwise the "first" object goes first (closest to SP), followed by the
+ // members of the "first" group.
//
// The rest are sorted by the group index to keep the groups together.
// Higher numbered groups are more likely to be around longer (i.e. untagged
@@ -4537,10 +4703,10 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
//
// If all else equal, sort by the object index to keep the objects in the
// original order.
- return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
- A.ObjectIndex) <
- std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
- B.ObjectIndex);
+ return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectFirst, A.GroupFirst,
+ A.GroupIndex, A.ObjectIndex) <
+ std::make_tuple(!B.IsValid, B.Accesses, B.ObjectFirst, B.GroupFirst,
+ B.GroupIndex, B.ObjectIndex);
}
} // namespace
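As a small self-contained illustration of the comparator change (again not part of the patch, with the constants taken from the FrameObject enum above), the lexicographic tuple comparison puts FPR-accessed objects before the hazard slot and the hazard slot before GPR-accessed objects, because AccessFPR(1) < AccessHazard(2) < AccessGPR(4):

#include <cassert>
#include <tuple>

int main() {
  const unsigned AccessFPR = 1, AccessHazard = 2, AccessGPR = 4;
  // IsValid is true for all three objects here, so !IsValid ties and the
  // Accesses field decides the order, placing FPR slots closest to SP.
  auto Key = [](unsigned Accesses) { return std::make_tuple(false, Accesses); };
  assert(Key(AccessFPR) < Key(AccessHazard));
  assert(Key(AccessHazard) < Key(AccessGPR));
  return 0;
}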
@@ -4549,19 +4715,32 @@ void AArch64FrameLowering::orderFrameObjects(
if (!OrderFrameObjects || ObjectsToAllocate.empty())
return;

+ const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
for (auto &Obj : ObjectsToAllocate) {
FrameObjects[Obj].IsValid = true;
FrameObjects[Obj].ObjectIndex = Obj;
}

- // Identify stack slots that are tagged at the same time.
+ // Identify FPR vs GPR slots for hazards, and stack slots that are tagged at
+ // the same time.
GroupBuilder GB(FrameObjects);
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.isDebugInstr())
continue;
+
+ if (AFI.hasStackHazardSlotIndex()) {
+ std::optional<int> FI = getLdStFrameID(MI, MFI);
+ if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
+ if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+ FrameObjects[*FI].Accesses |= FrameObject::AccessFPR;
+ else
+ FrameObjects[*FI].Accesses |= FrameObject::AccessGPR;
+ }
+ }
+
int OpIndex;
switch (MI.getOpcode()) {
case AArch64::STGloop:
@@ -4600,11 +4779,20 @@ void AArch64FrameLowering::orderFrameObjects(
GB.EndCurrentGroup();
}

+ if (AFI.hasStackHazardSlotIndex()) {
+ FrameObjects[AFI.getStackHazardSlotIndex()].Accesses =
+ FrameObject::AccessHazard;
+ // If a stack object is unknown or both GPR and FPR, sort it into GPR.
+ for (auto &Obj : FrameObjects)
+ if (!Obj.Accesses ||
+ Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR))
+ Obj.Accesses = FrameObject::AccessGPR;
+ }
+
// If the function's tagged base pointer is pinned to a stack slot, we want to
// put that slot first when possible. This will likely place it at SP + 0,
// and save one instruction when generating the base pointer because IRG does
// not allow an immediate offset.
- const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
if (TBPI) {
FrameObjects[*TBPI].ObjectFirst = true;