Skip to content

[llvm-exegesis] [AArch64] Add support for Load Instructions in subprocess execution mode #144895

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,6 @@ RUN: llvm-objdump -d %d > %t.s
RUN: FileCheck %s --check-prefix=FPCR-ASM < %t.s
FPCR-ASM: <foo>:
FPCR-ASM: movi d{{[0-9]+}}, #0000000000000000
FPCR-ASM-NEXT: mov x8, #0x0
FPCR-ASM-NEXT: msr FPCR, x8
FPCR-ASM-NEXT: mov x16, #0x0
FPCR-ASM-NEXT: msr FPCR, x16
FPCR-ASM-NEXT: bfcvt h{{[0-9]+}}, s{{[0-9]+}}
283 changes: 282 additions & 1 deletion llvm/tools/llvm-exegesis/lib/AArch64/Target.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,26 @@
//
//===----------------------------------------------------------------------===//
#include "../Target.h"
#include "../Error.h"
#include "../MmapUtils.h"
#include "../SerialSnippetGenerator.h"
#include "../SnippetGenerator.h"
#include "../SubprocessMemory.h"
#include "AArch64.h"
#include "AArch64RegisterInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCRegisterInfo.h"
#include <vector>
#define DEBUG_TYPE "exegesis-aarch64-target"

#if defined(__aarch64__) && defined(__linux__)
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h> // for getpagesize()
#ifdef HAVE_LIBPFM
#include <perfmon/perf_event.h>
#endif // HAVE_LIBPFM
#include <linux/prctl.h> // For PR_PAC_* constants
#include <sys/prctl.h>
#ifndef PR_PAC_APIAKEY
Expand Down Expand Up @@ -73,7 +89,7 @@ static MCInst loadPPRImmediate(MCRegister Reg, unsigned RegBitWidth,
// Generates instructions to load an immediate value into an FPCR register.
static std::vector<MCInst>
loadFPCRImmediate(MCRegister Reg, unsigned RegBitWidth, const APInt &Value) {
MCRegister TempReg = AArch64::X8;
MCRegister TempReg = AArch64::X16;
MCInst LoadImm = MCInstBuilder(AArch64::MOVi64imm).addReg(TempReg).addImm(0);
MCInst MoveToFPCR =
MCInstBuilder(AArch64::MSR).addImm(AArch64SysReg::FPCR).addReg(TempReg);
Expand Down Expand Up @@ -106,6 +122,90 @@ static MCInst loadFPImmediate(MCRegister Reg, unsigned RegBitWidth,
return Instructions;
}

/// Appends a pre-indexed store that pushes \p RegToPush onto the stack.
///
/// The emitted instruction is `STR [X|W]t, [SP, #imm]!`: SP is adjusted by
/// \p imm first (16 bytes downwards by default, which keeps the stack 16-byte
/// aligned) and the register is then stored at the updated SP.
/// Only GPR64 and GPR32 registers are accepted; anything else is unreachable.
static void generateRegisterStackPush(unsigned int RegToPush,
                                      std::vector<MCInst> &GeneratedCode,
                                      int imm = -16) {
  unsigned StoreOpcode;
  if (AArch64::GPR64RegClass.contains(RegToPush))
    StoreOpcode = AArch64::STRXpre;
  else if (AArch64::GPR32RegClass.contains(RegToPush))
    StoreOpcode = AArch64::STRWpre;
  else
    llvm_unreachable("Unsupported register class for stack push");

  // Operands: written-back SP, register to store, base SP, offset.
  GeneratedCode.push_back(MCInstBuilder(StoreOpcode)
                              .addReg(AArch64::SP)
                              .addReg(RegToPush)
                              .addReg(AArch64::SP)
                              .addImm(imm));
}

/// Appends a post-indexed load that pops the top of the stack into
/// \p RegToPopTo.
///
/// The emitted instruction is `LDR [X|W]t, [SP], #imm`: the register is loaded
/// from the current SP and SP is then adjusted by \p imm (16 bytes upwards by
/// default). Only GPR64 and GPR32 registers are accepted; anything else is
/// unreachable.
static void generateRegisterStackPop(unsigned int RegToPopTo,
                                     std::vector<MCInst> &GeneratedCode,
                                     int imm = 16) {
  unsigned LoadOpcode;
  if (AArch64::GPR64RegClass.contains(RegToPopTo))
    LoadOpcode = AArch64::LDRXpost;
  else if (AArch64::GPR32RegClass.contains(RegToPopTo))
    LoadOpcode = AArch64::LDRWpost;
  else
    llvm_unreachable("Unsupported register class for stack pop");

  // Operands: written-back SP, destination register, base SP, offset.
  GeneratedCode.push_back(MCInstBuilder(LoadOpcode)
                              .addReg(AArch64::SP)
                              .addReg(RegToPopTo)
                              .addReg(AArch64::SP)
                              .addImm(imm));
}

void generateSysCall(long SyscallNumber, std::vector<MCInst> &GeneratedCode) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some comments here about the mechanics of a syscall would be nice, e.g. that X8 is the register where the syscall number is expected, and what the SVC call does.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added comments explaining role of X8, X0-X5, SVC #0 and return convention
Thanks!

// AArch64 Linux follows the AAPCS (ARM Architecture Procedure Call Standard):
// - X8 register contains the system call number
// - X0-X5 registers contain the first 6 arguments (if any)
// - SVC #0 instruction triggers the system call
// - Return value is placed in X0 register
GeneratedCode.push_back(
loadImmediate(AArch64::X8, 64, APInt(64, SyscallNumber)));
GeneratedCode.push_back(MCInstBuilder(AArch64::SVC).addImm(0));
}

/// Functions to save/restore system call registers
#ifdef __linux__
// Registers X0-X7 in ascending order. saveSysCallRegisters() and
// restoreSysCallRegisters() index this table with the argument position, so
// entry I is the register saved/restored for argument I.
constexpr std::array<unsigned, 8> SyscallArgumentRegisters{
AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7,
};

/// Appends stack pushes that preserve the registers a system call clobbers.
///
/// X8 is pushed first, followed by the first \p ArgumentCount entries of
/// SyscallArgumentRegisters (X0 upwards). Each push uses the default 16-byte
/// slot of generateRegisterStackPush(). restoreSysCallRegisters() undoes the
/// pushes in the opposite order.
static void saveSysCallRegisters(std::vector<MCInst> &GeneratedCode,
                                 unsigned ArgumentCount) {
  assert(ArgumentCount <= 8 &&
         "This implementation saves up to 8 argument registers (X0-X7)");
  // X8 carries the system call number (see generateSysCall()), so it has to
  // survive the call as well.
  generateRegisterStackPush(AArch64::X8, GeneratedCode);
  // Then the argument registers, lowest-numbered first.
  for (unsigned ArgIdx = 0; ArgIdx != ArgumentCount; ++ArgIdx)
    generateRegisterStackPush(SyscallArgumentRegisters[ArgIdx], GeneratedCode);
}

/// Appends stack pops that undo saveSysCallRegisters().
///
/// The first \p ArgumentCount argument registers are popped highest-numbered
/// first, then X8, mirroring the order in which they were pushed.
static void restoreSysCallRegisters(std::vector<MCInst> &GeneratedCode,
                                    unsigned ArgumentCount) {
  assert(ArgumentCount <= 8 &&
         "This implementation restores up to 8 argument registers (X0-X7)");
  // Walk the argument registers from the last one saved down to X0.
  for (unsigned Remaining = ArgumentCount; Remaining != 0; --Remaining)
    generateRegisterStackPop(SyscallArgumentRegisters[Remaining - 1],
                             GeneratedCode);
  // X8 was pushed first, so it comes off the stack last.
  generateRegisterStackPop(AArch64::X8, GeneratedCode);
}
#endif // __linux__
#include "AArch64GenExegesis.inc"

namespace {
Expand All @@ -115,7 +215,39 @@ class ExegesisAArch64Target : public ExegesisTarget {
ExegesisAArch64Target()
: ExegesisTarget(AArch64CpuPfmCounters, AArch64_MC::isOpcodeAvailable) {}

// General-purpose registers given a fixed role by this target. All three are
// listed by getRegistersNeedSaving().
// NOTE(review): the individual roles are inferred from the enumerator names
// only (code size, auxiliary-memory file descriptor, scratch register) --
// confirm against the generic exegesis subprocess code.
enum ReservedRegisters {
CodeSize = AArch64::X12,
AuxiliaryMemoryFD = AArch64::X13,
// X16 is also the scratch register loadFPCRImmediate() uses.
TempRegister = AArch64::X16,
};

/// Returns the instruction(s) that pop the top of the stack into \p Reg.
///
/// \p imm is the post-increment applied to SP after the load. Note that the
/// default here is 0, unlike the file-local generateRegisterStackPop() helper
/// whose default is 16. Registers outside GPR32/GPR64 are not supported and
/// yield an empty vector.
std::vector<MCInst> _generateRegisterStackPop(MCRegister Reg,
                                              int imm = 0) const override {
  const bool IsGeneralPurpose = AArch64::GPR32RegClass.contains(Reg) ||
                                AArch64::GPR64RegClass.contains(Reg);
  if (!IsGeneralPurpose)
    return {};

  std::vector<MCInst> PopCode;
  generateRegisterStackPop(Reg, PopCode, imm);
  return PopCode;
}

private:
#ifdef __linux__
// Linux-only overrides. Their definitions follow the class, inside a matching
// `#ifdef __linux__` region.

// Loads ExitCode into X0 and issues the SYS_exit system call.
std::vector<MCInst> generateExitSyscall(unsigned ExitCode) const override;
// Emits an mmap system call for Length bytes at Address (shared, read/write).
// FileDescriptorAddress is currently loaded into X4 as-is; see the FIXME in
// the definition.
std::vector<MCInst>
generateMmap(uintptr_t Address, size_t Length,
uintptr_t FileDescriptorAddress) const override;
// Appends an anonymous shared mmap of SubprocessMemory::AuxiliaryMemorySize
// bytes at getAuxiliaryMemoryStartAddress() to GeneratedCode.
void generateMmapAuxMem(std::vector<MCInst> &GeneratedCode) const override;
// Maps the auxiliary memory, then pushes X0 (the mmap result) on the stack.
std::vector<MCInst> generateMemoryInitialSetup() const override;
// Currently emits no instructions and only prints a warning.
std::vector<MCInst> setStackRegisterToAuxMem() const override;
// Address at which the auxiliary memory is mapped; 0 would let the kernel
// choose.
uintptr_t getAuxiliaryMemoryStartAddress() const override;
// Currently a stub: emits no instructions and only prints a warning.
std::vector<MCInst> configurePerfCounter(long Request,
bool SaveRegisters) const override;
// Returns {X0, X1}.
std::vector<MCRegister> getArgumentRegisters() const override;
// Returns X0-X5, X8 and the three ReservedRegisters.
std::vector<MCRegister> getRegistersNeedSaving() const override;
#endif // __linux__

std::vector<MCInst> setRegTo(const MCSubtargetInfo &STI, MCRegister Reg,
const APInt &Value) const override {
if (AArch64::GPR32RegClass.contains(Reg))
Expand Down Expand Up @@ -155,6 +287,155 @@ class ExegesisAArch64Target : public ExegesisTarget {

} // namespace

#ifdef __linux__
// Selects how the address of the auxiliary memory mapping is chosen:
//   true : map it at a fixed address just below VAddressSpaceCeiling.
//   false: pass address 0 to mmap so the kernel picks the address.
// Nothing in this file ever changes the value, so it is a compile-time
// constant with internal linkage rather than a mutable global.
static constexpr bool UseFixedAddress = true;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the first argument to the mmap syscall isn't it? If the address is 0, i.e. not specified, then MMAP returns an address? This could be clarified. But either way, why do we only use the fixed addresses? And is there any value in having this boolean here? Do we need it, or is it clearer to just get rid of it if we only use the fixed addresses anyway?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mmap always returns an address. When you pass an address, there is no guarantee that will actually be the address of the mapping. The actual address of the mapping (or an error code) gets returned.

Exegesis currently only has support for mappings at a fixed address.

Given this flag is always set to true, we should just get rid of it.


// Upper bound used when placing the auxiliary memory mapping (see
// getAuxiliaryMemoryStartAddress()). 0x0000800000000000 is 2^47, i.e. the
// first address above a 47-bit user address range.
// NOTE(review): this matches the x86-64 user-space limit; confirm it is the
// intended ceiling for the AArch64 virtual-address configurations in use.
static constexpr const uintptr_t VAddressSpaceCeiling = 0x0000800000000000;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not that familiar with the memory map. For folks with the same problem, can you comment and document where this number comes from?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll look at opening up a patch to document this. It essentially comes from https://www.kernel.org/doc/html/v5.8/x86/x86_64/mm.html which ends up being a function of how the MMU is set up.


static void generateRoundToNearestPage(unsigned int TargetRegister,
std::vector<MCInst> &GeneratedCode) {
int PageSizeShift = static_cast<int>(round(log2(getpagesize())));
// Round down to the nearest page by getting rid of the least significant bits
// representing location in the page.

// Single instruction using AND with inverted mask (effectively BIC)
uint64_t BitsToClearMask = (1ULL << PageSizeShift) - 1; // 0xFFF
uint64_t AndMask = ~BitsToClearMask; // ...FFFFFFFFFFFF000
GeneratedCode.push_back(MCInstBuilder(AArch64::ANDXri)
.addReg(TargetRegister) // Xd
.addReg(TargetRegister) // Xn
.addImm(AndMask) // imm bitmask
);
}

std::vector<MCInst>
ExegesisAArch64Target::generateExitSyscall(unsigned ExitCode) const {
std::vector<MCInst> ExitCallCode;
ExitCallCode.push_back(loadImmediate(AArch64::X0, 64, APInt(64, ExitCode)));
generateSysCall(SYS_exit, ExitCallCode); // SYS_exit is 93
return ExitCallCode;
}

std::vector<MCInst>
ExegesisAArch64Target::generateMmap(uintptr_t Address, size_t Length,
uintptr_t FileDescriptorAddress) const {
// mmap(address, length, prot, flags, fd, offset=0)
int flags = MAP_SHARED;
if (Address != 0) {
flags |= MAP_FIXED_NOREPLACE;
}
std::vector<MCInst> MmapCode;
MmapCode.push_back(
loadImmediate(AArch64::X0, 64, APInt(64, Address))); // map adr
MmapCode.push_back(
loadImmediate(AArch64::X1, 64, APInt(64, Length))); // length
MmapCode.push_back(loadImmediate(AArch64::X2, 64,
APInt(64, PROT_READ | PROT_WRITE))); // prot
MmapCode.push_back(loadImmediate(AArch64::X3, 64, APInt(64, flags))); // flags
// FIXME: File descriptor address is not initialized.
// Copy file descriptor location from aux memory into X4
MmapCode.push_back(
loadImmediate(AArch64::X4, 64, APInt(64, FileDescriptorAddress))); // fd
MmapCode.push_back(loadImmediate(AArch64::X5, 64, APInt(64, 0))); // offset
generateSysCall(SYS_mmap, MmapCode); // SYS_mmap is 222
return MmapCode;
}

// Appends the code for an anonymous, shared, read/write mmap of
// SubprocessMemory::AuxiliaryMemorySize bytes at the address returned by
// getAuxiliaryMemoryStartAddress(). No file backs the mapping, so fd is -1 and
// MAP_ANONYMOUS is always set; MAP_FIXED_NOREPLACE is added for a non-zero
// address.
void ExegesisAArch64Target::generateMmapAuxMem(
    std::vector<MCInst> &GeneratedCode) const {
  const int AnonymousFD = -1;
  const int Protection = PROT_READ | PROT_WRITE;
  const uintptr_t MapAddress = getAuxiliaryMemoryStartAddress();

  int MapFlags = MAP_SHARED | MAP_ANONYMOUS;
  if (MapAddress != 0)
    MapFlags |= MAP_FIXED_NOREPLACE;

  // mmap(address, length, prot, flags, fd, offset) -> X0..X5
  GeneratedCode.push_back(
      loadImmediate(AArch64::X0, 64, APInt(64, MapAddress))); // map adr
  GeneratedCode.push_back(loadImmediate(
      AArch64::X1, 64,
      APInt(64, SubprocessMemory::AuxiliaryMemorySize))); // length
  GeneratedCode.push_back(
      loadImmediate(AArch64::X2, 64, APInt(64, Protection))); // prot
  GeneratedCode.push_back(
      loadImmediate(AArch64::X3, 64, APInt(64, MapFlags))); // flags
  GeneratedCode.push_back(
      loadImmediate(AArch64::X4, 64, APInt(64, AnonymousFD))); // fd
  GeneratedCode.push_back(
      loadImmediate(AArch64::X5, 64, APInt(64, 0))); // offset
  generateSysCall(SYS_mmap, GeneratedCode);          // SYS_mmap is 222
}

// Builds the initial memory setup: map the auxiliary memory, then save the
// mmap result on the stack.
std::vector<MCInst> ExegesisAArch64Target::generateMemoryInitialSetup() const {
  std::vector<MCInst> SetupCode;
  generateMmapAuxMem(SetupCode); // FIXME: Uninit file descriptor

  // After the mmap system call X0 holds its return value, i.e. the address of
  // the mapping. It is pushed unconditionally (also when a fixed address is
  // used) so that code running later can recover the address from the stack
  // even if X0 has been overwritten in the meantime.
  generateRegisterStackPush(AArch64::X0, SetupCode);
  // FIXME: Ensure stack pointer remains stable to prevent loss of saved address
  return SetupCode;
}

// Placeholder: no instructions are emitted for AArch64. A warning is printed
// on every call so the missing implementation does not go unnoticed.
// TODO: Implement this, if required.
std::vector<MCInst> ExegesisAArch64Target::setStackRegisterToAuxMem() const {
  dbgs() << "Warning: setStackRegisterToAuxMem called but not required for "
            "AArch64\n";
  return {};
}

// Returns the address at which the auxiliary memory should be mapped, or 0 to
// let the kernel select one.
uintptr_t ExegesisAArch64Target::getAuxiliaryMemoryStartAddress() const {
  if (UseFixedAddress) {
    // Two pages below VAddressSpaceCeiling (0x0000800000000000), to try and
    // keep clear of memory annotations in the snippet.
    // FIXME: Why 2 pages?
    return VAddressSpaceCeiling - (2 * getpagesize());
  }
  // Address 0 asks the kernel to choose an appropriate location.
  return 0;
}

std::vector<MCInst>
ExegesisAArch64Target::configurePerfCounter(long Request,
bool SaveRegisters) const {
std::vector<MCInst> ConfigurePerfCounterCode; // NOP
// FIXME: SYSCALL exits with EBADF error - file descriptor is invalid
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's unclear to me what the consequences are of this? Does not work at all? Does it mean all measurements are unreliable? Please clarify.

Copy link
Contributor Author

@lakshayk-nv lakshayk-nv Jul 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is unclear to me where to get the file descriptor's address, and using fd=-1 (as is done when mmap is called for the aux memory) returns an EBADF error in the X8 register after the syscall.
The function configurePerfCounter should set up performance monitoring via perf_event_open,
so that the subprocess's setup code is not included when benchmarking the instruction.

Yes, Currently all measurements are unreliable in subprocess mode for AArch64.
PS: Added warning for unimplemented configurePerfCounter and resultant unreliable measurements

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance counters do not work at all if these syscalls do not work. This probably needs to be fixed before this lands.

// No file has been opened previously whose descriptor could be used here
dbgs() << "Warning: configurePerfCounter not implemented, measurements will "
"be unreliable\n";
return ConfigurePerfCounterCode;
}

std::vector<MCRegister> ExegesisAArch64Target::getArgumentRegisters() const {
return {AArch64::X0, AArch64::X1};
}

std::vector<MCRegister> ExegesisAArch64Target::getRegistersNeedSaving() const {
return {
AArch64::X0,
AArch64::X1,
AArch64::X2,
AArch64::X3,
AArch64::X4,
AArch64::X5,
AArch64::X8,
ReservedRegisters::TempRegister,
ReservedRegisters::CodeSize,
ReservedRegisters::AuxiliaryMemoryFD,
};
}

#endif // __linux__

static ExegesisTarget *getTheExegesisAArch64Target() {
static ExegesisAArch64Target Target;
return &Target;
Expand Down
34 changes: 34 additions & 0 deletions llvm/tools/llvm-exegesis/lib/Assembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ static bool generateSnippetSetupCode(const ExegesisTarget &ET,
assert(MM.Address % getpagesize() == 0 &&
"Memory mappings need to be aligned to page boundaries.");
#endif
// FIXME: file descriptor for aux memory seems not initialized.
// TODO: Invoke openat syscall to get correct fd for aux memory
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would this need openat instead of open?

const MemoryValue &MemVal = Key.MemoryValues.at(MM.MemoryValueName);
BBF.addInstructions(ET.generateMmap(
MM.Address, MemVal.SizeBytes,
Expand All @@ -78,15 +80,47 @@ static bool generateSnippetSetupCode(const ExegesisTarget &ET,
Register StackPointerRegister = BBF.MF.getSubtarget()
.getTargetLowering()
->getStackPointerRegisterToSaveRestore();
#define DEBUG_TYPE "register-initial-values"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding new debug info here should probably be split out to a separate patch.

// FIXME: Only loading first register with memory address is hacky.
bool isFirstRegister = true;
for (const RegisterValue &RV : Key.RegisterInitialValues) {
// Debug: register name and class name and value from BenchmarkKey
const MCRegisterInfo *RegInfo = BBF.MF.getTarget().getMCRegisterInfo();
const char *RegName = RegInfo->getName(RV.Register);
const char *regClassName = "Unknown";
for (unsigned i = 0, e = RegInfo->getNumRegClasses(); i < e; ++i) {
const MCRegisterClass &RC = RegInfo->getRegClass(i);
if (RC.contains(RV.Register)) {
regClassName = RegInfo->getRegClassName(&RC);
break;
}
}
LLVM_DEBUG(
dbgs() << "Setting register (Class: " << regClassName << ") " << RegName
<< std::string(
std::max(0, 3 - static_cast<int>(strlen(RegName))), ' '));

if (GenerateMemoryInstructions) {
// If we're generating memory instructions, don't load in the value for
// the register with the stack pointer as it will be used later to finish
// the setup.
if (Register(RV.Register) == StackPointerRegister)
continue;
#if defined(__aarch64__)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think a preprocessor directive here is correct. Exegesis should be able to generate snippets for non-native platforms.

auto StackLoadInsts = ET._generateRegisterStackPop(RV.Register, 16);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this only need to be done on AArch64?

if (!StackLoadInsts.empty() && isFirstRegister) {
for (const auto &Inst : StackLoadInsts)
BBF.addInstruction(Inst);
isFirstRegister = false;
LLVM_DEBUG(dbgs() << "from stack with post-increment offset of " << 16
<< " bytes\n");
continue;
}
#endif
}
// Load a constant in the register.
LLVM_DEBUG(dbgs() << " to " << RV.Value << "\n");
#undef DEBUG_TYPE
const auto SetRegisterCode = ET.setRegTo(*MSI, RV.Register, RV.Value);
if (SetRegisterCode.empty())
IsSnippetSetupComplete = false;
Expand Down
Loading
Loading