[BOLT][AArch64] Add support for compact code model #112110

Open. Wants to merge 1 commit into base: main.
3 changes: 3 additions & 0 deletions bolt/include/bolt/Core/BinaryBasicBlock.h
@@ -819,6 +819,9 @@ class BinaryBasicBlock {
return OutputAddressRange;
}

uint64_t getOutputStartAddress() const { return OutputAddressRange.first; }
uint64_t getOutputEndAddress() const { return OutputAddressRange.second; }

bool hasLocSyms() const { return LocSyms != nullptr; }

/// Return mapping of input offsets to symbols in the output.
3 changes: 2 additions & 1 deletion bolt/include/bolt/Core/FunctionLayout.h
@@ -123,7 +123,8 @@ class FunctionFragment {
const_iterator begin() const;
iterator end();
const_iterator end() const;
const BinaryBasicBlock *front() const;
BinaryBasicBlock *front() const;
BinaryBasicBlock *back() const;

friend class FunctionLayout;
};
13 changes: 13 additions & 0 deletions bolt/include/bolt/Passes/LongJmp.h
@@ -63,6 +63,19 @@ class LongJmpPass : public BinaryFunctionPass {
uint32_t NumColdStubs{0};
uint32_t NumSharedStubs{0};

/// The shortest distance for any branch instruction on AArch64.
static constexpr size_t ShortestJumpBits = 16;
static constexpr size_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);

/// The longest single-instruction branch.
static constexpr size_t LongestJumpBits = 28;
static constexpr size_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);

/// Relax all internal function branches including those between fragments.
/// Assume that fragments are placed in different sections but are within
/// 128MB of each other.
void relaxLocalBranches(BinaryFunction &BF);

/// -- Layout estimation methods --
/// Try to do layout before running the emitter, by looking at BinaryFunctions
/// and MCInsts -- this is an estimation. To be correct for longjmp inserter
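Aside (not part of the diff): the two span constants above correspond to AArch64 branch encodings. TBZ/TBNZ carry a 14-bit word offset, i.e. 16 bits of byte offset (about +/-32KB), while B/BL carry a 26-bit word offset, i.e. 28 bits of byte offset (+/-128MB). A minimal standalone C++ sketch of that arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
  // Values mirror the constants added to LongJmpPass above; this sketch only
  // evaluates the spans, it is not part of the pass.
  constexpr unsigned ShortestJumpBits = 16; // TBZ/TBNZ byte-offset width.
  constexpr unsigned LongestJumpBits = 28;  // B/BL byte-offset width.
  constexpr uint64_t ShortestJumpSpan = 1ULL << (ShortestJumpBits - 1);
  constexpr uint64_t LongestJumpSpan = 1ULL << (LongestJumpBits - 1);
  std::printf("shortest reach: +/-%llu KiB\n",
              static_cast<unsigned long long>(ShortestJumpSpan / 1024)); // 32
  std::printf("longest reach:  +/-%llu MiB\n",
              static_cast<unsigned long long>(LongestJumpSpan >> 20));   // 128
  return 0;
}

Under the compact (<128MB) code model, this is why a single unconditional B placed in a trampoline is assumed to reach any fragment of the function.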
4 changes: 3 additions & 1 deletion bolt/lib/Core/FunctionLayout.cpp
@@ -33,7 +33,9 @@ FunctionFragment::const_iterator FunctionFragment::end() const {
return const_iterator(Layout->block_begin() + StartIndex + Size);
}

const BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }
BinaryBasicBlock *FunctionFragment::front() const { return *begin(); }

BinaryBasicBlock *FunctionFragment::back() const { return *std::prev(end()); }

FunctionLayout::FunctionLayout() { addFragment(); }

278 changes: 275 additions & 3 deletions bolt/lib/Passes/LongJmp.cpp
@@ -11,18 +11,26 @@
//===----------------------------------------------------------------------===//

#include "bolt/Passes/LongJmp.h"
#include "bolt/Core/ParallelUtilities.h"
#include "llvm/Support/MathExtras.h"

#define DEBUG_TYPE "longjmp"

using namespace llvm;

namespace opts {
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;
extern llvm::cl::opt<unsigned> AlignText;
extern cl::opt<unsigned> AlignFunctions;
extern cl::opt<bool> UseOldText;
extern cl::opt<bool> HotFunctionsAtEnd;

static cl::opt<bool>
CompactCodeModel("compact-code-model",
cl::desc("generate code for binaries <128MB on AArch64"),
cl::init(false), cl::cat(BoltCategory));

static cl::opt<bool> GroupStubs("group-stubs",
cl::desc("share stubs across functions"),
cl::init(true), cl::cat(BoltOptCategory));
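Aside (illustrative, not from the diff): since CompactCodeModel defaults to false, existing users keep the current stub-insertion behaviour; the new relaxation path would be opted into with an invocation along the lines of llvm-bolt app -o app.bolt --compact-code-model, while --group-stubs continues to default to on.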
@@ -61,10 +69,10 @@ static BinaryBasicBlock *getBBAtHotColdSplitPoint(BinaryFunction &Func) {
if (Next != E && (*Next)->isCold())
return *I;
}
llvm_unreachable("No hot-colt split point found");
llvm_unreachable("No hot-cold split point found");
}

static bool shouldInsertStub(const BinaryContext &BC, const MCInst &Inst) {
static bool mayNeedStub(const BinaryContext &BC, const MCInst &Inst) {
return (BC.MIB->isBranch(Inst) || BC.MIB->isCall(Inst)) &&
!BC.MIB->isIndirectBranch(Inst) && !BC.MIB->isIndirectCall(Inst);
}
@@ -565,7 +573,7 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
if (BC.MIB->isPseudo(Inst))
continue;

if (!shouldInsertStub(BC, Inst)) {
if (!mayNeedStub(BC, Inst)) {
DotAddress += InsnSize;
continue;
}
@@ -629,7 +637,271 @@ Error LongJmpPass::relax(BinaryFunction &Func, bool &Modified) {
return Error::success();
}

void LongJmpPass::relaxLocalBranches(BinaryFunction &BF) {
BinaryContext &BC = BF.getBinaryContext();
auto &MIB = BC.MIB;

if (!BF.isSimple())
return;

Review comment (Member): This is also covered by the SkipPredicate, right?

// Quick path.
if (!BF.isSplit() && BF.estimateSize() < ShortestJumpSpan)
return;

auto isBranchOffsetInRange = [&](const MCInst &Inst, int64_t Offset) {
const unsigned Bits = MIB->getPCRelEncodingSize(Inst);
return isIntN(Bits, Offset);
};

auto isBlockInRange = [&](const MCInst &Inst, uint64_t InstAddress,
const BinaryBasicBlock &BB) {
const int64_t Offset = BB.getOutputStartAddress() - InstAddress;
return isBranchOffsetInRange(Inst, Offset);
};

// Keep track of *all* function trampolines that are going to be added to the
// function layout at the end of relaxation.
std::vector<std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>>>
FunctionTrampolines;

// Function fragments are relaxed independently.
for (FunctionFragment &FF : BF.getLayout().fragments()) {
// Fill out code size estimation for the fragment. Use output BB address
// ranges to store offsets from the start of the function.
uint64_t CodeSize = 0;
for (BinaryBasicBlock *BB : FF) {
BB->setOutputStartAddress(CodeSize);
CodeSize += BB->estimateSize();
BB->setOutputEndAddress(CodeSize);
}

// Dynamically-updated size of the fragment.
uint64_t FragmentSize = CodeSize;

// Size of the trampoline in bytes.
constexpr uint64_t TrampolineSize = 4;

// Trampolines created for the fragment. DestinationBB -> TrampolineBB.
// NB: here we store only the first trampoline created for DestinationBB.
DenseMap<const BinaryBasicBlock *, BinaryBasicBlock *> FragmentTrampolines;

// Create a trampoline code after \p BB or at the end of the fragment if BB
// is nullptr.
auto addTrampolineAfter = [&](BinaryBasicBlock *BB,
BinaryBasicBlock *TargetBB, uint64_t Count,
bool UpdateOffsets = true) {

Review comment (@paschalis-mpeis, Member, Oct 17, 2024): (Not so sure on this, so consider it as me 'thinking out loud'.) Would it make sense to add a check on whether TargetBB is in range? Given that this is for binaries <128MB, I assume it'll be in range. But could there be a borderline case where TargetBB was initially in range but relaxations in between have shifted it right outside of range? If that's a possibility, would we expect the relaxation loop to eventually get it right, or is this part of the bits that will be 'offloaded' to the JIT linker?

std::unique_ptr<BinaryBasicBlock> TrampolineBB = BF.createBasicBlock();
MCInst Inst;
{
auto L = BC.scopeLock();
MIB->createUncondBranch(Inst, TargetBB->getLabel(), BC.Ctx.get());
}
TrampolineBB->addInstruction(Inst);
TrampolineBB->addSuccessor(TargetBB, Count);
TrampolineBB->setExecutionCount(Count);
const uint64_t TrampolineAddress =
BB ? BB->getOutputEndAddress() : FragmentSize;
TrampolineBB->setOutputStartAddress(TrampolineAddress);
TrampolineBB->setOutputEndAddress(TrampolineAddress + TrampolineSize);
TrampolineBB->setFragmentNum(FF.getFragmentNum());

if (UpdateOffsets) {
FragmentSize += TrampolineSize;
for (BinaryBasicBlock *IBB : FF) {
if (IBB->getOutputStartAddress() >= TrampolineAddress) {
IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
TrampolineSize);
IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
TrampolineSize);
}
}
for (auto &Pair : FunctionTrampolines) {
BinaryBasicBlock *IBB = Pair.second.get();
if (IBB->getFragmentNum() != TrampolineBB->getFragmentNum())
continue;
if (IBB == TrampolineBB.get())
continue;

Review comment (Member): I'm not sure if I'm reading the code correctly, but this could go away if:
  1. we get rid of the earlier loop (L710), and
  2. we move the FragmentTrampolines insertion (L733) before the loop (if there are no other side effects)?

if (IBB->getOutputStartAddress() >= TrampolineAddress) {
IBB->setOutputStartAddress(IBB->getOutputStartAddress() +
TrampolineSize);
IBB->setOutputEndAddress(IBB->getOutputEndAddress() +
TrampolineSize);
}
}
}

if (!FragmentTrampolines.lookup(TargetBB))
FragmentTrampolines[TargetBB] = TrampolineBB.get();
FunctionTrampolines.emplace_back(BB ? BB : FF.back(),
std::move(TrampolineBB));

return FunctionTrampolines.back().second.get();
};

// Pre-populate trampolines by splitting unconditional branches from the
// containing basic block.
for (BinaryBasicBlock *BB : FF) {
MCInst *Inst = BB->getLastNonPseudoInstr();
if (!Inst || !MIB->isUnconditionalBranch(*Inst))
continue;

const MCSymbol *TargetSymbol = MIB->getTargetSymbol(*Inst);
BB->eraseInstruction(BB->findInstruction(Inst));
BB->setOutputEndAddress(BB->getOutputEndAddress() - TrampolineSize);

BinaryBasicBlock::BinaryBranchInfo BI;
BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol, BI);

BinaryBasicBlock *TrampolineBB =
addTrampolineAfter(BB, TargetBB, BI.Count, /*UpdateOffsets*/ false);
BB->replaceSuccessor(TargetBB, TrampolineBB, BI.Count);
}

/// Relax the branch \p Inst. Return true if basic block offsets need an
/// update after the trampoline insertion.

Review comment (Collaborator): Is the "Return true" part of the comment still accurate? It looks like this lambda doesn't return a value, and addTrampolineAfter handles the update of offsets.

auto relaxBranch = [&](BinaryBasicBlock *BB, MCInst &Inst,
uint64_t InstAddress, BinaryBasicBlock *TargetBB) {
BinaryFunction *BF = BB->getParent();

// Use branch taken count for optimal relaxation.
const uint64_t Count = BB->getBranchInfo(*TargetBB).Count;
assert(Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
"Expected valid branch execution count");

// Try to reuse an existing trampoline without introducing any new code.
BinaryBasicBlock *TrampolineBB = FragmentTrampolines.lookup(TargetBB);
if (TrampolineBB && isBlockInRange(Inst, InstAddress, *TrampolineBB)) {
BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
TrampolineBB->setExecutionCount(TrampolineBB->getExecutionCount() +
Count);
auto L = BC.scopeLock();
MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
return;
}

// For cold branches, check if we can introduce a trampoline at the end
// of the fragment that is within the branch reach. Note that such
// trampoline may change address later and become unreachable in which
// case we will need further relaxation.
const int64_t OffsetToEnd = FragmentSize - InstAddress;
if (Count == 0 && isBranchOffsetInRange(Inst, OffsetToEnd)) {
TrampolineBB = addTrampolineAfter(nullptr, TargetBB, Count);
BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
auto L = BC.scopeLock();
MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());

return;
}

// Insert a new block after the current one and use it as a trampoline.
TrampolineBB = addTrampolineAfter(BB, TargetBB, Count);

// If the other successor is a fall-through, invert the condition code.
const BinaryBasicBlock *const NextBB =
BF->getLayout().getBasicBlockAfter(BB, /*IgnoreSplits*/ false);
if (BB->getConditionalSuccessor(false) == NextBB) {
BB->swapConditionalSuccessors();
auto L = BC.scopeLock();
MIB->reverseBranchCondition(Inst, NextBB->getLabel(), BC.Ctx.get());
} else {
auto L = BC.scopeLock();
MIB->replaceBranchTarget(Inst, TrampolineBB->getLabel(), BC.Ctx.get());
}
BB->replaceSuccessor(TargetBB, TrampolineBB, Count);
};

bool MayNeedRelaxation;
uint64_t NumIterations = 0;
do {
MayNeedRelaxation = false;
++NumIterations;
for (auto BBI = FF.begin(); BBI != FF.end(); ++BBI) {
BinaryBasicBlock *BB = *BBI;
uint64_t NextInstOffset = BB->getOutputStartAddress();
for (MCInst &Inst : *BB) {
const size_t InstAddress = NextInstOffset;
if (!MIB->isPseudo(Inst))
NextInstOffset += 4;

if (!mayNeedStub(BF.getBinaryContext(), Inst))
continue;

const size_t BitsAvailable = MIB->getPCRelEncodingSize(Inst);

// Span of +/-128MB.
if (BitsAvailable == LongestJumpBits)
continue;

const MCSymbol *TargetSymbol = MIB->getTargetSymbol(Inst);
BinaryBasicBlock *TargetBB = BB->getSuccessor(TargetSymbol);
assert(TargetBB &&
"Basic block target expected for conditional branch.");

// Check if the relaxation is needed.
if (TargetBB->getFragmentNum() == FF.getFragmentNum() &&
isBlockInRange(Inst, InstAddress, *TargetBB))
continue;

relaxBranch(BB, Inst, InstAddress, TargetBB);

MayNeedRelaxation = true;
}
}

// We may have added new instructions, but the whole fragment is less than
// the minimum branch span.
if (FragmentSize < ShortestJumpSpan)
MayNeedRelaxation = false;

} while (MayNeedRelaxation);

LLVM_DEBUG({
if (NumIterations > 2) {
dbgs() << "BOLT-DEBUG: relaxed fragment " << FF.getFragmentNum().get()
<< " of " << BF << " in " << NumIterations << " iterations\n";
}
});
}

// Add trampoline blocks from all fragments to the layout.
DenseMap<BinaryBasicBlock *, std::vector<std::unique_ptr<BinaryBasicBlock>>>
Insertions;
for (std::pair<BinaryBasicBlock *, std::unique_ptr<BinaryBasicBlock>> &Pair :
FunctionTrampolines) {
if (!Pair.second)
continue;
Insertions[Pair.first].emplace_back(std::move(Pair.second));
}

for (auto &Pair : Insertions) {
BF.insertBasicBlocks(Pair.first, std::move(Pair.second),
/*UpdateLayout*/ true, /*UpdateCFI*/ true,
/*RecomputeLPs*/ false);
}
}

Error LongJmpPass::runOnFunctions(BinaryContext &BC) {

if (opts::CompactCodeModel) {
BC.outs()
<< "BOLT-INFO: relaxing branches for compact code model (<128MB)\n";

ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
relaxLocalBranches(BF);
};

ParallelUtilities::PredicateTy SkipPredicate =
[&](const BinaryFunction &BF) {
return !BC.shouldEmit(BF) || !BF.isSimple();
};

ParallelUtilities::runOnEachFunction(
BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
SkipPredicate, "RelaxLocalBranches");

return Error::success();
}

BC.outs() << "BOLT-INFO: Starting stub-insertion pass\n";
std::vector<BinaryFunction *> Sorted = BC.getSortedFunctions();
bool Modified;
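Aside (a standalone sketch, not part of the patch): the per-branch decision in relaxLocalBranches boils down to the signed-range test that isBranchOffsetInRange performs with llvm::isIntN over MIB->getPCRelEncodingSize(Inst). The toy program below reproduces that test for an AArch64 conditional branch (19-bit word immediate, i.e. 21 bits of byte offset, about +/-1MB) and shows the case where the pass would instead route the branch through a trampoline holding an unconditional B.

#include <cstdint>
#include <cstdio>

// Same predicate as llvm::isIntN(Bits, Offset): does Offset fit in a signed
// Bits-bit immediate?
static bool fitsSignedBits(int64_t Offset, unsigned Bits) {
  const int64_t Limit = int64_t(1) << (Bits - 1);
  return Offset >= -Limit && Offset < Limit;
}

int main() {
  const unsigned CondBranchBits = 21;   // B.cond/CBZ byte-offset width (+/-1MB).
  const unsigned UncondBranchBits = 28; // B byte-offset width (+/-128MB).
  // Hypothetical distance of about 3MB between the branch and its target.
  const int64_t Offset = 0x300000 - 0x10000;

  if (fitsSignedBits(Offset, CondBranchBits)) {
    std::puts("in range: leave the conditional branch as is");
  } else if (fitsSignedBits(Offset, UncondBranchBits)) {
    // Relaxation case: retarget the conditional branch to a nearby trampoline
    // block whose single unconditional B covers the remaining distance.
    std::puts("out of range for B.cond: route through a trampoline");
  }
  return 0;
}

As I read the diff, the fall-through inversion in relaxBranch is the complementary trick: when the taken successor cannot be reached, the condition is reversed so the short branch targets the old fall-through block and the trampoline placed right after the block carries the far target.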