Skip to content

Commit

Permalink
[AMDGPU] Add IR lowering changes for preloaded kernargs
Browse files Browse the repository at this point in the history
Preloaded kernel arguments should not be lowered in the IR pass
AMDGPULowerKernelArguments. Therefore it's necessary to calculate the
total number of user SGPRs that are available for preloading and how
many SGPRs would be required to preload each argument to determine
whether we should skip lowering, i.e., the argument will be preloaded
instead.

Reviewed By: bcahoon

Differential Revision: https://reviews.llvm.org/D156853
  • Loading branch information
kerbowa committed Sep 25, 2023
1 parent 330fa7d commit 7b70af2
Show file tree
Hide file tree
Showing 2 changed files with 526 additions and 1 deletion.
58 changes: 57 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,58 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

// Tracks how many user SGPRs remain available for preloading kernel
// arguments, and hands them out one argument at a time.
class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  // User SGPRs still unclaimed and therefore usable for argument preloading.
  unsigned NumFreeUserSGPRs;

public:
  SmallVector<llvm::Metadata *, 8> KernelArgMetadata;

  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Initialize NumFreeUserSGPRs to the user SGPRs left over after the
  // subtarget's already-reserved user SGPRs are subtracted from the maximum.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs =
        ST.getMaxNumUserSGPRs() - UserSGPRInfo.getNumUsedUserSGPRs();
  }

  // Try to reserve the user SGPRs needed to preload an argument of size
  // AllocSize bytes at kernarg offset ArgOffset.  LastExplicitArgOffset is
  // the end offset of the previous explicit argument; the gap between the
  // two is padding that also consumes SGPRs.  Returns true on success and
  // deducts the registers from NumFreeUserSGPRs; returns false if not
  // enough free user SGPRs remain.
  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
                            uint64_t LastExplicitArgOffset) {
    // A sub-dword argument that is not 4-byte aligned packs into the same
    // SGPR as the previous argument, so no extra registers are required.
    if (AllocSize < 4 && !isAligned(Align(4), ArgOffset))
      return true;

    // SGPRs wasted on kernarg alignment padding before this argument.
    const unsigned PaddingSGPRs =
        alignTo(ArgOffset - LastExplicitArgOffset, 4) / 4;
    // SGPRs holding the argument itself, rounded up to whole registers.
    const unsigned ArgSGPRs = alignTo(AllocSize, 4) / 4;
    const unsigned RequiredSGPRs = ArgSGPRs + PaddingSGPRs;
    if (RequiredSGPRs > NumFreeUserSGPRs)
      return false;

    NumFreeUserSGPRs -= RequiredSGPRs;
    return true;
  }
};

class AMDGPULowerKernelArguments : public FunctionPass {
public:
static char ID;
Expand Down Expand Up @@ -84,6 +126,9 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));

uint64_t ExplicitArgOffset = 0;
// Preloaded kernel arguments must be sequential.
bool InPreloadSequence = true;
PreloadKernelArgInfo PreloadInfo(F, ST);

for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
Expand All @@ -95,8 +140,19 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
uint64_t LastExplicitArgOffset = ExplicitArgOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;

// Try to preload this argument into user SGPRs.
if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
!ST.needsKernargPreloadBackwardsCompatibility() &&
!Arg.getType()->isAggregateType())
if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
LastExplicitArgOffset))
continue;

InPreloadSequence = false;

if (Arg.use_empty())
continue;

Expand Down
Loading

0 comments on commit 7b70af2

Please sign in to comment.