Skip to content

[AMDGPU] Replace dynamic VGPR feature with attribute #133444

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -761,11 +761,6 @@ For example:
performant than code generated for XNACK replay
disabled.

dynamic-vgpr TODO Represents the "Dynamic VGPR" hardware mode, introduced in GFX12.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message.

=============== ============================ ==================================================

.. _amdgpu-target-id:
Expand Down Expand Up @@ -1753,6 +1748,15 @@ The AMDGPU backend supports the following LLVM IR attributes.

"amdgpu-promote-alloca-to-vector-vgpr-ratio" Ratio of VGPRs to budget for promoting alloca to vectors.

"amdgpu-dynamic-vgpr-block-size" Represents the size of a VGPR block in the "Dynamic VGPR" hardware mode,
introduced in GFX12.
A value of 0 (default) means that dynamic VGPRs are not enabled.
Valid non-zero values for GFX12+ are 16 and 32.
Waves launched in this mode may allocate or deallocate the VGPRs
using dedicated instructions, but may not send the DEALLOC_VGPRS
message. If a shader has this attribute, then all its callees must
match its value.

================================================ ==========================================================

Calling Conventions
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1263,12 +1263,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
>;

// FIXME: Remove once all users have migrated to the
// "amdgpu-dynamic-vgpr-block-size" attribute.
def FeatureDynamicVGPR : SubtargetFeature <"dynamic-vgpr",
"DynamicVGPR",
"true",
"Enable dynamic VGPR mode"
>;

// FIXME: Remove once all users have migrated to the
// "amdgpu-dynamic-vgpr-block-size" attribute.
def FeatureDynamicVGPRBlockSize32 : SubtargetFeature<"dynamic-vgpr-block-size-32",
"DynamicVGPRBlockSize32",
"true",
Expand Down
30 changes: 19 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -450,15 +450,17 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
unsigned MaxWaves = MFI.getMaxWavesPerEU();
uint64_t TotalNumVgpr =
getTotalNumVGPRs(STM.hasGFX90AInsts(), NumAgpr, NumVgpr);
uint64_t NumVGPRsForWavesPerEU = std::max(
{TotalNumVgpr, (uint64_t)1, (uint64_t)STM.getMinNumVGPRs(MaxWaves)});
uint64_t NumVGPRsForWavesPerEU =
std::max({TotalNumVgpr, (uint64_t)1,
(uint64_t)STM.getMinNumVGPRs(
MaxWaves, MFI.getDynamicVGPRBlockSize())});
uint64_t NumSGPRsForWavesPerEU = std::max(
{NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
STM.getOccupancyWithWorkGroupSizes(*MF).second,
MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext), STM,
OutContext);
MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
MFI.getDynamicVGPRBlockSize(), STM, OutContext);
uint64_t Occupancy;

const auto [MinWEU, MaxWEU] = AMDGPU::getIntegerPairAttribute(
Expand Down Expand Up @@ -1071,7 +1073,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
Ctx);
ProgInfo.NumVGPRsForWavesPerEU =
AMDGPUMCExpr::createMax({ProgInfo.NumVGPR, CreateExpr(1ul),
CreateExpr(STM.getMinNumVGPRs(MaxWaves))},
CreateExpr(STM.getMinNumVGPRs(
MaxWaves, MFI->getDynamicVGPRBlockSize()))},
Ctx);

if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
Expand Down Expand Up @@ -1245,7 +1248,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,

ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
STM.computeOccupancy(F, ProgInfo.LDSSize).second,
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU, STM, Ctx);
ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
MFI->getDynamicVGPRBlockSize(), STM, Ctx);

const auto [MinWEU, MaxWEU] =
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
Expand Down Expand Up @@ -1394,7 +1398,8 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
// Helper function to add common PAL Metadata 3.0+
static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
const SIProgramInfo &CurrentProgramInfo,
CallingConv::ID CC, const GCNSubtarget &ST) {
CallingConv::ID CC, const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) {
if (ST.hasIEEEMode())
MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);

Expand All @@ -1406,7 +1411,7 @@ static void EmitPALMetadataCommon(AMDGPUPALMetadata *MD,
(bool)CurrentProgramInfo.TrapHandlerEnable);
MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);

if (ST.isDynamicVGPREnabled())
if (DynamicVGPRBlockSize != 0)
MD->setComputeRegisters(".dynamic_vgpr_en", true);
}

Expand All @@ -1433,7 +1438,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
// If dynamic VGPRs are enabled for this function, set the number of saved
// dynamic VGPRs (if any) in the PAL metadata.
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
if (STM.isDynamicVGPREnabled() &&
if (MFI->isDynamicVGPREnabled() &&
MFI->getScratchReservedForDynamicVGPRs() > 0)
MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
MFI->getScratchReservedForDynamicVGPRs() / 4);
Expand All @@ -1459,7 +1464,8 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
MD->setHwStage(CC, ".scratch_en", msgpack::Type::Boolean,
CurrentProgramInfo.ScratchEnable);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM);
EmitPALMetadataCommon(MD, CurrentProgramInfo, CC, STM,
MFI->getDynamicVGPRBlockSize());
}

// ScratchSize is in bytes, 16 aligned.
Expand Down Expand Up @@ -1530,7 +1536,9 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
MD->setRsrc2(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getComputePGMRSrc2(Ctx), Ctx);
} else {
EmitPALMetadataCommon(MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST);
EmitPALMetadataCommon(
MD, CurrentProgramInfo, CallingConv::AMDGPU_CS, ST,
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize());
}

// Set optional info
Expand Down
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,8 +201,16 @@ static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
return 128;

const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
// Temporarily check both the attribute and the subtarget feature, until the
// latter is removed.
if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

unsigned MaxVGPRs = ST.getMaxNumVGPRs(
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first);
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
DynamicVGPRBlockSize);

// A non-entry function has only 32 caller preserved registers.
// Do not promote alloca which will force spilling unless we know the function
Expand Down
38 changes: 24 additions & 14 deletions llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,10 @@ void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// TODO: assert Regions are sorted descending by pressure
const auto &ST = MF.getSubtarget<GCNSubtarget>();
const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
const unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();
const auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);
LLVM_DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
<< ", current = " << Occ << '\n');

Expand All @@ -457,7 +460,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
// Always build the DAG to add mutations
BuildDAG DAG(*R, *this);

if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
if (R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >= NewOcc)
continue;

LLVM_DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
Expand All @@ -468,7 +471,7 @@ unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
LLVM_DEBUG(dbgs() << "Occupancy improvement attempt:\n";
printSchedRP(dbgs(), R->MaxPressure, MaxRP));

NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST, DynamicVGPRBlockSize));
if (NewOcc <= Occ)
break;

Expand All @@ -489,9 +492,11 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();

sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);

bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
Expand Down Expand Up @@ -522,19 +527,21 @@ void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
const auto RP = getRegionPressure(*R);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));

if (RP.getOccupancy(ST) < TgtOcc) {
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
} else {
LLVM_DEBUG(dbgs() << ", restoring\n");
Ovr.restoreOrder();
assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
assert(R->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize) >=
TgtOcc);
}
}
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
}
MFI->limitOccupancy(FinalOccupancy);
Expand Down Expand Up @@ -580,9 +587,11 @@ void GCNIterativeScheduler::scheduleILP(
const auto &ST = MF.getSubtarget<GCNSubtarget>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
auto TgtOcc = MFI->getMinAllowedOccupancy();
unsigned DynamicVGPRBlockSize = MFI->getDynamicVGPRBlockSize();

sortRegionsByPressure(TgtOcc);
auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
auto Occ =
Regions.front()->MaxPressure.getOccupancy(ST, DynamicVGPRBlockSize);

bool IsReentry = false;
if (TryMaximizeOccupancy && Occ < TgtOcc) {
Expand All @@ -603,17 +612,18 @@ void GCNIterativeScheduler::scheduleILP(
const auto RP = getSchedulePressure(*R, ILPSchedule);
LLVM_DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));

if (RP.getOccupancy(ST) < TgtOcc) {
if (RP.getOccupancy(ST, DynamicVGPRBlockSize) < TgtOcc) {
LLVM_DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
if (R->BestSchedule.get() &&
R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
if (R->BestSchedule.get() && R->BestSchedule->MaxPressure.getOccupancy(
ST, DynamicVGPRBlockSize) >= TgtOcc) {
LLVM_DEBUG(dbgs() << ", scheduling minimal register\n");
scheduleBest(*R);
}
} else {
scheduleRegion(*R, ILPSchedule, RP);
LLVM_DEBUG(printSchedResult(dbgs(), R, RP));
FinalOccupancy = std::min(FinalOccupancy, RP.getOccupancy(ST));
FinalOccupancy =
std::min(FinalOccupancy, RP.getOccupancy(ST, DynamicVGPRBlockSize));
}
}
MFI->limitOccupancy(FinalOccupancy);
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,9 @@ bool GCNNSAReassignImpl::run(MachineFunction &MF) {

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
MaxNumVGPRs = std::min(
ST->getMaxNumVGPRs(MFI->getOccupancy(), MFI->getDynamicVGPRBlockSize()),
MaxNumVGPRs);
CSRegs = MRI->getCalleeSavedRegs();

using Candidate = std::pair<const MachineInstr*, bool>;
Expand Down
24 changes: 15 additions & 9 deletions llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"

using namespace llvm;
Expand Down Expand Up @@ -74,17 +75,20 @@ void GCNRegPressure::inc(unsigned Reg,
bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
unsigned MaxOccupancy) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned DynamicVGPRBlockSize =
MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize();

const auto SGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(getSGPRNum()));
const auto VGPROcc =
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
const auto VGPROcc = std::min(
MaxOccupancy, ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
const auto OtherSGPROcc = std::min(MaxOccupancy,
ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
const auto OtherVGPROcc =
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts())));
std::min(MaxOccupancy,
ST.getOccupancyWithNumVGPRs(O.getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));

const auto Occ = std::min(SGPROcc, VGPROcc);
const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
Expand Down Expand Up @@ -206,21 +210,23 @@ bool GCNRegPressure::less(const MachineFunction &MF, const GCNRegPressure &O,
O.getVGPRNum(ST.hasGFX90AInsts()));
}

Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
return Printable([&RP, ST](raw_ostream &OS) {
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST,
unsigned DynamicVGPRBlockSize) {
return Printable([&RP, ST, DynamicVGPRBlockSize](raw_ostream &OS) {
OS << "VGPRs: " << RP.getArchVGPRNum() << ' '
<< "AGPRs: " << RP.getAGPRNum();
if (ST)
OS << "(O"
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()))
<< ST->getOccupancyWithNumVGPRs(RP.getVGPRNum(ST->hasGFX90AInsts()),
DynamicVGPRBlockSize)
<< ')';
OS << ", SGPRs: " << RP.getSGPRNum();
if (ST)
OS << "(O" << ST->getOccupancyWithNumSGPRs(RP.getSGPRNum()) << ')';
OS << ", LVGPR WT: " << RP.getVGPRTuplesWeight()
<< ", LSGPR WT: " << RP.getSGPRTuplesWeight();
if (ST)
OS << " -> Occ: " << RP.getOccupancy(*ST);
OS << " -> Occ: " << RP.getOccupancy(*ST, DynamicVGPRBlockSize);
OS << '\n';
});
}
Expand Down
18 changes: 12 additions & 6 deletions llvm/lib/Target/AMDGPU/GCNRegPressure.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,18 +69,22 @@ struct GCNRegPressure {
}
unsigned getSGPRTuplesWeight() const { return Value[TOTAL_KINDS + SGPR]; }

unsigned getOccupancy(const GCNSubtarget &ST) const {
unsigned getOccupancy(const GCNSubtarget &ST,
unsigned DynamicVGPRBlockSize) const {
return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts())));
ST.getOccupancyWithNumVGPRs(getVGPRNum(ST.hasGFX90AInsts()),
DynamicVGPRBlockSize));
}

void inc(unsigned Reg,
LaneBitmask PrevMask,
LaneBitmask NewMask,
const MachineRegisterInfo &MRI);

bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure& O) const {
return getOccupancy(ST) > O.getOccupancy(ST);
bool higherOccupancy(const GCNSubtarget &ST, const GCNRegPressure &O,
unsigned DynamicVGPRBlockSize) const {
return getOccupancy(ST, DynamicVGPRBlockSize) >
O.getOccupancy(ST, DynamicVGPRBlockSize);
}

/// Compares \p this GCNRegpressure to \p O, returning true if \p this is
Expand Down Expand Up @@ -133,7 +137,8 @@ struct GCNRegPressure {
friend GCNRegPressure max(const GCNRegPressure &P1,
const GCNRegPressure &P2);

friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST);
friend Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST,
unsigned DynamicVGPRBlockSize);
};

inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
Expand Down Expand Up @@ -402,7 +407,8 @@ GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
bool isEqual(const GCNRPTracker::LiveRegSet &S1,
const GCNRPTracker::LiveRegSet &S2);

Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr);
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST = nullptr,
unsigned DynamicVGPRBlockSize = 0);

Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
const MachineRegisterInfo &MRI);
Expand Down
Loading
Loading