[AArch64] Add streaming-mode stack hazard optimization remarks #101695

Merged 2 commits on Aug 6, 2024
6 changes: 6 additions & 0 deletions llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -15,6 +15,7 @@

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/Support/TypeSize.h"
#include <vector>

@@ -473,6 +474,11 @@ class TargetFrameLowering {
/// Return the frame base information to be encoded in the DWARF subprogram
/// debug info.
virtual DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const;

/// This method is called at the end of prolog/epilog code insertion, so
/// targets can emit remarks based on the final frame layout.
virtual void emitRemarks(const MachineFunction &MF,
MachineOptimizationRemarkEmitter *ORE) const {};
};

} // End llvm namespace
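As a minimal sketch of how a backend could use the new hook (a hypothetical MyTargetFrameLowering, not part of this patch; the real AArch64 override appears later in this diff), an implementation might emit a simple frame-layout remark:

void MyTargetFrameLowering::emitRemarks(
    const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {
  // Called from PEI after prolog/epilog insertion, so MF.getFrameInfo()
  // already reflects the final frame layout at this point.
  ORE->emit([&]() {
    return MachineOptimizationRemarkAnalysis("mytarget", "FrameSize",
                                             MF.getFunction().getSubprogram(),
                                             &MF.front())
           << "final stack size: "
           << ore::NV("StackSize", MF.getFrameInfo().getStackSize());
  });
}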
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -341,6 +341,9 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
<< ore::NV("Function", MF.getFunction().getName()) << "'";
});

// Emit any remarks implemented for the target, based on final frame layout.
TFI->emitRemarks(MF, ORE);

delete RS;
SaveBlocks.clear();
RestoreBlocks.clear();
204 changes: 196 additions & 8 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -240,6 +240,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -275,6 +276,10 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
// Stack hazard padding size. 0 = disabled.
static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
cl::init(0), cl::Hidden);
// Stack hazard size for analysis remarks. StackHazardSize takes precedence.
static cl::opt<unsigned>
StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0),
cl::Hidden);
// Whether to insert padding into non-streaming functions (for testing).
static cl::opt<bool>
StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
@@ -2615,9 +2620,16 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF,
const auto &MFI = MF.getFrameInfo();

int64_t ObjectOffset = MFI.getObjectOffset(FI);
StackOffset SVEStackSize = getSVEStackSize(MF);

// For VLA-area objects, just emit an offset at the end of the stack frame.
// Whilst not quite correct, these objects do live at the end of the frame and
// so it is more useful for analysis for the offset to reflect this.
if (MFI.isVariableSizedObjectIndex(FI)) {
return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize;
}

// This is correct in the absence of any SVE stack objects.
StackOffset SVEStackSize = getSVEStackSize(MF);
if (!SVEStackSize)
return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());

@@ -3528,13 +3540,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return true;
}

// Return the FrameID for a Load/Store instruction by looking at the MMO.
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
const MachineFrameInfo &MFI) {
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
return std::nullopt;

MachineMemOperand *MMO = *MI.memoperands_begin();
// Return the FrameID for an MMO.
static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
const MachineFrameInfo &MFI) {
auto *PSV =
dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
if (PSV)
@@ -3552,6 +3560,15 @@ static std::optional<int> getLdStFrameID(const MachineInstr &MI,
return std::nullopt;
}

// Return the FrameID for a Load/Store instruction by looking at the first MMO.
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
const MachineFrameInfo &MFI) {
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
return std::nullopt;

return getMMOFrameID(*MI.memoperands_begin(), MFI);
}

// Check if a Hazard slot is needed for the current function, and if so create
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
// which can be used to determine if any hazard padding is needed.
@@ -5029,3 +5046,174 @@ void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF,
MI->eraseFromParent();
}
}

struct StackAccess {
enum AccessType {
NotAccessed = 0, // Stack object not accessed by load/store instructions.
GPR = 1 << 0, // A general purpose register.
PPR = 1 << 1, // A predicate register.
FPR = 1 << 2, // A floating point/Neon/SVE register.
};

int Idx;
StackOffset Offset;
int64_t Size;
unsigned AccessTypes;

StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {}

bool operator<(const StackAccess &Rhs) const {
return std::make_tuple(start(), Idx) <
std::make_tuple(Rhs.start(), Rhs.Idx);
}

bool isCPU() const {
// Predicate register load and store instructions execute on the CPU.
return AccessTypes & (AccessType::GPR | AccessType::PPR);
}
bool isSME() const { return AccessTypes & AccessType::FPR; }
bool isMixed() const { return isCPU() && isSME(); }

int64_t start() const { return Offset.getFixed() + Offset.getScalable(); }
int64_t end() const { return start() + Size; }

std::string getTypeString() const {
switch (AccessTypes) {
case AccessType::FPR:
return "FPR";
case AccessType::PPR:
return "PPR";
case AccessType::GPR:
return "GPR";
case AccessType::NotAccessed:
return "NA";
default:
return "Mixed";
}
}

void print(raw_ostream &OS) const {
OS << getTypeString() << " stack object at [SP"
<< (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed();
if (Offset.getScalable())
OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable()
<< " * vscale";
OS << "]";
}
};

static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) {
SA.print(OS);
return OS;
}

void AArch64FrameLowering::emitRemarks(
const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {

SMEAttrs Attrs(MF.getFunction());
if (Attrs.hasNonStreamingInterfaceAndBody())
return;

const uint64_t HazardSize =
(StackHazardSize) ? StackHazardSize : StackHazardRemarkSize;

if (HazardSize == 0)
return;

const MachineFrameInfo &MFI = MF.getFrameInfo();
// Bail if function has no stack objects.
if (!MFI.hasStackObjects())
return;

std::vector<StackAccess> StackAccesses(MFI.getNumObjects());
Collaborator: I'm not sure what happens if MFI.getNumObjects()==0, but it might be worth bailing out early.

Contributor (author): Good point - Done.

size_t NumFPLdSt = 0;
size_t NumNonFPLdSt = 0;

// Collect stack accesses via Load/Store instructions.
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
continue;
for (MachineMemOperand *MMO : MI.memoperands()) {
std::optional<int> FI = getMMOFrameID(MMO, MFI);
if (FI && !MFI.isDeadObjectIndex(*FI)) {
int FrameIdx = *FI;

size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects();
if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) {
StackAccesses[ArrIdx].Idx = FrameIdx;
StackAccesses[ArrIdx].Offset =
getFrameIndexReferenceFromSP(MF, FrameIdx);
StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx);
}

unsigned RegTy = StackAccess::AccessType::GPR;
if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) {
if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg()))
RegTy = StackAccess::PPR;
else
RegTy = StackAccess::FPR;
} else if (AArch64InstrInfo::isFpOrNEON(MI)) {
RegTy = StackAccess::FPR;
}

StackAccesses[ArrIdx].AccessTypes |= RegTy;

if (RegTy == StackAccess::FPR)
++NumFPLdSt;
else
++NumNonFPLdSt;
}
}
}
}

if (NumFPLdSt == 0 || NumNonFPLdSt == 0)
return;

llvm::sort(StackAccesses);
StackAccesses.erase(llvm::remove_if(StackAccesses,
[](const StackAccess &S) {
return S.AccessTypes ==
StackAccess::NotAccessed;
}),
StackAccesses.end());

SmallVector<const StackAccess *> MixedObjects;
SmallVector<std::pair<const StackAccess *, const StackAccess *>> HazardPairs;

if (StackAccesses.front().isMixed())
MixedObjects.push_back(&StackAccesses.front());

for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end());
It != End; ++It) {
const auto &First = *It;
const auto &Second = *(It + 1);

if (Second.isMixed())
MixedObjects.push_back(&Second);

if ((First.isSME() && Second.isCPU()) ||
(First.isCPU() && Second.isSME())) {
uint64_t Distance = static_cast<uint64_t>(Second.start() - First.end());
if (Distance < HazardSize)
HazardPairs.emplace_back(&First, &Second);
}
}

auto EmitRemark = [&](llvm::StringRef Str) {
ORE->emit([&]() {
auto R = MachineOptimizationRemarkAnalysis(
"sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front());
return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str;
});
};

for (const auto &P : HazardPairs)
EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str());

for (const auto *Obj : MixedObjects)
EmitRemark(
formatv("{0} accessed by both GP and FP instructions", *Obj).str());
}
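For illustration only (hypothetical function name, offsets, and sizes; not output produced by this patch's tests), remarks emitted by the code above take roughly this form:

stack hazard in 'foo': FPR stack object at [SP-32-16 * vscale] is too close to GPR stack object at [SP-8]
stack hazard in 'foo': Mixed stack object at [SP-8] accessed by both GP and FP instructions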
6 changes: 5 additions & 1 deletion llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -13,8 +13,9 @@
#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H
#define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H

#include "llvm/Support/TypeSize.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/Support/TypeSize.h"

namespace llvm {

@@ -178,6 +179,9 @@ class AArch64FrameLowering : public TargetFrameLowering {
inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI,
int64_t NegProbeSize,
Register TargetReg) const;

void emitRemarks(const MachineFunction &MF,
MachineOptimizationRemarkEmitter *ORE) const override;
};

} // End llvm namespace