Skip to content

Commit

Permalink
[AMDGPU] Add utilities to track number of user SGPRs. NFC.
Browse files Browse the repository at this point in the history
Factor out and unify some common code that calculates and tracks the
number of user SGPRs.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D159439
  • Loading branch information
kerbowa committed Sep 12, 2023
1 parent 9048aa7 commit 343be51
Show file tree
Hide file tree
Showing 10 changed files with 229 additions and 159 deletions.
28 changes: 15 additions & 13 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,28 +393,29 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
uint16_t KernelCodeProperties = 0;
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();

if (MFI.hasPrivateSegmentBuffer()) {
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
if (MFI.hasDispatchPtr()) {
if (UserSGPRInfo.hasDispatchPtr()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
}
if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
}
if (MFI.hasKernargSegmentPtr()) {
if (UserSGPRInfo.hasKernargSegmentPtr()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
}
if (MFI.hasDispatchID()) {
if (UserSGPRInfo.hasDispatchID()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
}
if (MFI.hasFlatScratchInit()) {
if (UserSGPRInfo.hasFlatScratchInit()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
}
Expand Down Expand Up @@ -1165,27 +1166,28 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));

if (MFI->hasPrivateSegmentBuffer()) {
const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}

if (MFI->hasDispatchPtr())
if (UserSGPRInfo.hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;

if (MFI->hasKernargSegmentPtr())
if (UserSGPRInfo.hasKernargSegmentPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;

if (MFI->hasDispatchID())
if (UserSGPRInfo.hasDispatchID())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;

if (MFI->hasFlatScratchInit())
if (UserSGPRInfo.hasFlatScratchInit())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;

if (MFI->hasDispatchPtr())
if (UserSGPRInfo.hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

if (STM.isXNACKEnabled())
Expand Down
18 changes: 10 additions & 8 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -455,27 +455,28 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info.hasPrivateSegmentBuffer()) {
const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}

if (Info.hasDispatchPtr()) {
if (UserSGPRInfo.hasDispatchPtr()) {
Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}

const Module *M = MF.getFunction().getParent();
if (Info.hasQueuePtr() &&
if (UserSGPRInfo.hasQueuePtr() &&
AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}

if (Info.hasKernargSegmentPtr()) {
if (UserSGPRInfo.hasKernargSegmentPtr()) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Expand All @@ -486,13 +487,13 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(InputPtrReg);
}

if (Info.hasDispatchID()) {
if (UserSGPRInfo.hasDispatchID()) {
Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}

if (Info.hasFlatScratchInit()) {
if (UserSGPRInfo.hasFlatScratchInit()) {
Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
Expand Down Expand Up @@ -597,15 +598,16 @@ bool AMDGPUCallLowering::lowerFormalArguments(

SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();

if (Info->hasImplicitBufferPtr()) {
if (UserSGPRInfo.hasImplicitBufferPtr()) {
Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}

// FIXME: This probably isn't defined for mesa
if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1103,7 +1103,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
Offset += 8; // Skipped.
}

if (MFI.hasQueuePtr())
if (MFI.getUserSGPRInfo().hasQueuePtr())
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
//
// If we only have implicit uses of flat_scr on flat instructions, it is not
// really needed.
if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
Expand Down
103 changes: 88 additions & 15 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
Expand Down Expand Up @@ -692,7 +693,7 @@ GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
Expand Down Expand Up @@ -770,25 +771,27 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
static constexpr unsigned getMaxNumPreloadedSGPRs() {
using USI = GCNUserSGPRUsageInfo;
// Max number of user SGPRs
unsigned MaxUserSGPRs = 4 + // private segment buffer
2 + // Dispatch ptr
2 + // queue ptr
2 + // kernel segment ptr
2 + // dispatch ID
2 + // flat scratch init
2; // Implicit buffer ptr
const unsigned MaxUserSGPRs =
USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
USI::getNumUserSGPRForField(USI::DispatchPtrID) +
USI::getNumUserSGPRForField(USI::QueuePtrID) +
USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
USI::getNumUserSGPRForField(USI::DispatchIdID) +
USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

// Max number of system SGPRs
unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
1 + // WorkGroupIDY
1 + // WorkGroupIDZ
1 + // WorkGroupInfo
1; // private segment wave byte offset
const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
1 + // WorkGroupIDY
1 + // WorkGroupIDZ
1 + // WorkGroupInfo
1; // private segment wave byte offset

// Max number of synthetic SGPRs
unsigned SyntheticSGPRs = 1; // LDSKernelId
const unsigned SyntheticSGPRs = 1; // LDSKernelId

return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
Expand Down Expand Up @@ -1018,3 +1021,73 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
else
return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}

// Determine which user SGPR ABI inputs a function requires, based on its
// calling convention, function attributes, and subtarget features.
GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  // Kernels need the kernarg segment pointer when they take explicit
  // arguments or when the target reserves bytes for implicit arguments.
  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  // The private segment buffer (HSA/Mesa compute) and the implicit buffer
  // pointer (Mesa graphics shaders) are mutually exclusive ways of reaching
  // scratch; neither is needed when flat scratch is enabled.
  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  // Graphics calling conventions do not receive these ABI inputs; each can
  // be individually disabled via an "amdgpu-no-*" attribute.
  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }
}

unsigned GCNUserSGPRUsageInfo::getNumUsedUserSGPRs() const {
unsigned NumUserSGPRs = 0;
if (hasImplicitBufferPtr())
NumUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

if (hasPrivateSegmentBuffer())
NumUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

if (hasDispatchPtr())
NumUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

if (hasQueuePtr())
NumUserSGPRs += getNumUserSGPRForField(QueuePtrID);

if (hasKernargSegmentPtr())
NumUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

if (hasDispatchID())
NumUserSGPRs += getNumUserSGPRForField(DispatchIdID);

if (hasFlatScratchInit())
NumUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);

return NumUserSGPRs;
}
74 changes: 74 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/Support/ErrorHandling.h"

#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
Expand Down Expand Up @@ -1378,6 +1379,79 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
}
};

/// Tracks which "user SGPR" ABI inputs a function requires and the number of
/// SGPRs each occupies when preloaded.
class GCNUserSGPRUsageInfo {
public:
  /// Returns the total number of SGPRs occupied by the enabled fields.
  unsigned getNumUsedUserSGPRs() const;

  bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }

  bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }

  bool hasDispatchPtr() const { return DispatchPtr; }

  bool hasQueuePtr() const { return QueuePtr; }

  bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }

  bool hasDispatchID() const { return DispatchID; }

  bool hasFlatScratchInit() const { return FlatScratchInit; }

  /// Identifier for each user SGPR field.
  enum UserSGPRID : unsigned {
    ImplicitBufferPtrID = 0,
    PrivateSegmentBufferID = 1,
    DispatchPtrID = 2,
    QueuePtrID = 3,
    KernargSegmentPtrID = 4,
    DispatchIdID = 5,
    FlatScratchInitID = 6,
    PrivateSegmentSizeID = 7
  };

  // Returns the size in number of SGPRs for preload user SGPR field.
  static constexpr unsigned getNumUserSGPRForField(UserSGPRID ID) {
    switch (ID) {
    case ImplicitBufferPtrID:
      return 2;
    case PrivateSegmentBufferID:
      return 4;
    case DispatchPtrID:
      return 2;
    case QueuePtrID:
      return 2;
    case KernargSegmentPtrID:
      return 2;
    case DispatchIdID:
      return 2;
    case FlatScratchInitID:
      return 2;
    case PrivateSegmentSizeID:
      return 1;
    }
    llvm_unreachable("Unknown UserSGPRID.");
  }

  /// Computes the required fields from \p F's calling convention, attributes,
  /// and the subtarget \p ST.
  GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);

private:
  // Private memory buffer
  // Compute directly in sgpr[0:1]
  // Other shaders indirect 64-bits at sgpr[0:1]
  bool ImplicitBufferPtr = false;

  bool PrivateSegmentBuffer = false;

  bool DispatchPtr = false;

  bool QueuePtr = false;

  bool KernargSegmentPtr = false;

  bool DispatchID = false;

  bool FlatScratchInit = false;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
Loading

0 comments on commit 343be51

Please sign in to comment.