Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1844,6 +1844,20 @@ The AMDGPU backend supports the following calling conventions:
..TODO::
Describe.

``amdgpu_gfx_whole_wave`` Used for AMD graphics targets. Functions with this calling convention
cannot be used as entry points. They must have an i1 as the first argument,
which will be mapped to the value of EXEC on entry into the function. Other
arguments will contain poison in their inactive lanes. Similarly, the return
value for the inactive lanes is poison.

The function will run with all lanes enabled, i.e. EXEC will be set to -1 in the
prologue and restored to its original value in the epilogue. The inactive lanes
will be preserved for all the registers used by the function. Active lanes only
will only be preserved for the callee saved registers.

In all other respects, functions with this calling convention behave like
``amdgpu_gfx`` functions.

``amdgpu_gs`` Used for Mesa/AMDPAL geometry shaders.
..TODO::
Describe.
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/AsmParser/LLToken.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ enum Kind {
kw_amdgpu_cs_chain_preserve,
kw_amdgpu_kernel,
kw_amdgpu_gfx,
kw_amdgpu_gfx_whole_wave,
kw_tailcc,
kw_m68k_rtdcc,
kw_graalcc,
Expand Down
8 changes: 8 additions & 0 deletions llvm/include/llvm/IR/CallingConv.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,9 @@ namespace CallingConv {
RISCV_VLSCall_32768 = 122,
RISCV_VLSCall_65536 = 123,

// Calling convention for AMDGPU whole wave functions.
AMDGPU_Gfx_WholeWave = 124,

/// The highest possible ID. Must be some 2^k - 1.
MaxID = 1023
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Failed to increase the MaxID.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MaxID in this enumeration doesn't change for new CCs; unless you meant something else?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah my bad. I missed the zero in it. Never mind.

};
Expand All @@ -294,8 +297,13 @@ namespace CallingConv {
/// directly or indirectly via a call-like instruction.
constexpr bool isCallableCC(CallingConv::ID CC) {
switch (CC) {
// Called with special intrinsics:
// llvm.amdgcn.cs.chain
case CallingConv::AMDGPU_CS_Chain:
case CallingConv::AMDGPU_CS_ChainPreserve:
// llvm.amdgcn.call.whole.wave
case CallingConv::AMDGPU_Gfx_WholeWave:
// Hardware entry points:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/AsmParser/LLLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(amdgpu_cs_chain_preserve);
KEYWORD(amdgpu_kernel);
KEYWORD(amdgpu_gfx);
KEYWORD(amdgpu_gfx_whole_wave);
KEYWORD(tailcc);
KEYWORD(m68k_rtdcc);
KEYWORD(graalcc);
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/AsmParser/LLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2272,6 +2272,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) {
CC = CallingConv::AMDGPU_CS_ChainPreserve;
break;
case lltok::kw_amdgpu_kernel: CC = CallingConv::AMDGPU_KERNEL; break;
case lltok::kw_amdgpu_gfx_whole_wave:
CC = CallingConv::AMDGPU_Gfx_WholeWave;
break;
case lltok::kw_tailcc: CC = CallingConv::Tail; break;
case lltok::kw_m68k_rtdcc: CC = CallingConv::M68k_RTD; break;
case lltok::kw_graalcc: CC = CallingConv::GRAAL; break;
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/IR/AsmWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
break;
case CallingConv::AMDGPU_KERNEL: Out << "amdgpu_kernel"; break;
case CallingConv::AMDGPU_Gfx: Out << "amdgpu_gfx"; break;
case CallingConv::AMDGPU_Gfx_WholeWave:
Out << "amdgpu_gfx_whole_wave";
break;
case CallingConv::M68k_RTD: Out << "m68k_rtdcc"; break;
case CallingConv::RISCV_VectorCall:
Out << "riscv_vector_cc";
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/IR/Function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1232,6 +1232,7 @@ bool llvm::CallingConv::supportsNonVoidReturnType(CallingConv::ID CC) {
case CallingConv::AArch64_SVE_VectorCall:
case CallingConv::WASM_EmscriptenInvoke:
case CallingConv::AMDGPU_Gfx:
case CallingConv::AMDGPU_Gfx_WholeWave:
case CallingConv::M68k_INTR:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/IR/Verifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2979,6 +2979,16 @@ void Verifier::visitFunction(const Function &F) {
"perfect forwarding!",
&F);
break;
case CallingConv::AMDGPU_Gfx_WholeWave:
Check(!F.arg_empty() && F.arg_begin()->getType()->isIntegerTy(1),
"Calling convention requires first argument to be i1", &F);
Check(!F.arg_begin()->hasInRegAttr(),
"Calling convention requires first argument to not be inreg", &F);
Check(!F.isVarArg(),
"Calling convention does not support varargs or "
"perfect forwarding!",
&F);
break;
}

// Check that the argument values match the function type for this function...
Expand Down
32 changes: 29 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -374,15 +374,20 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B, const Value *Val,
return true;
}

unsigned ReturnOpc =
IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::SI_RETURN;
const bool IsWholeWave = MFI->isWholeWaveFunction();
unsigned ReturnOpc = IsWholeWave ? AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN
: IsShader ? AMDGPU::SI_RETURN_TO_EPILOG
: AMDGPU::SI_RETURN;
auto Ret = B.buildInstrNoInsert(ReturnOpc);

if (!FLI.CanLowerReturn)
insertSRetStores(B, Val->getType(), VRegs, FLI.DemoteRegister);
else if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;

if (IsWholeWave)
addOriginalExecToReturn(B.getMF(), Ret);

// TODO: Handle CalleeSavedRegsViaCopy.

B.insertInstr(Ret);
Expand Down Expand Up @@ -632,6 +637,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (DL.getTypeStoreSize(Arg.getType()) == 0)
continue;

if (Info->isWholeWaveFunction() && Idx == 0) {
assert(VRegs[Idx].size() == 1 && "Expected only one register");

// The first argument for whole wave functions is the original EXEC value.
B.buildInstr(AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP)
.addDef(VRegs[Idx][0]);

++Idx;
continue;
}

const bool InReg = Arg.hasAttribute(Attribute::InReg);

if (Arg.hasAttribute(Attribute::SwiftSelf) ||
Expand Down Expand Up @@ -1347,6 +1363,7 @@ bool AMDGPUCallLowering::lowerTailCall(
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave &&
!AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
Expand Down Expand Up @@ -1524,7 +1541,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
Info.CallConv != CallingConv::AMDGPU_Gfx_WholeWave) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
Expand Down Expand Up @@ -1592,3 +1610,11 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

return true;
}

void AMDGPUCallLowering::addOriginalExecToReturn(
MachineFunction &MF, MachineInstrBuilder &Ret) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineInstr *Setup = TII->getWholeWaveFunctionSetup(MF);
Ret.addReg(Setup->getOperand(0).getReg());
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ class AMDGPUCallLowering final : public CallLowering {
bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;

void addOriginalExecToReturn(MachineFunction &MF,
MachineInstrBuilder &Ret) const;

public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUGISel.td
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,10 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_PREFETCH, SIsbuffer_prefetch>;

def : GINodeEquiv<G_AMDGPU_WHOLE_WAVE_FUNC_SETUP, AMDGPUwhole_wave_setup>;
// G_AMDGPU_WHOLE_WAVE_FUNC_RETURN is simpler than AMDGPUwhole_wave_return,
// so we don't mark it as equivalent.

class GISelSop2Pat <
SDPatternOperator node,
Instruction inst,
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1143,6 +1143,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::Cold:
return CC_AMDGPU_Func;
case CallingConv::AMDGPU_Gfx:
case CallingConv::AMDGPU_Gfx_WholeWave:
return CC_SI_Gfx;
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
Expand All @@ -1168,6 +1169,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_LS:
return RetCC_SI_Shader;
case CallingConv::AMDGPU_Gfx:
case CallingConv::AMDGPU_Gfx_WholeWave:
return RetCC_SI_Gfx;
case CallingConv::C:
case CallingConv::Fast:
Expand Down Expand Up @@ -5875,6 +5877,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
NODE_NAME_CASE(WHOLE_WAVE_SETUP)
NODE_NAME_CASE(WHOLE_WAVE_RETURN)
}
return nullptr;
}
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,12 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_FMAX,
BUFFER_ATOMIC_COND_SUB_U32,
LAST_MEMORY_OPCODE = BUFFER_ATOMIC_COND_SUB_U32,

// Set up a whole wave function.
WHOLE_WAVE_SETUP,

// Return from a whole wave function.
WHOLE_WAVE_RETURN,
};

} // End namespace AMDGPUISD
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,17 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",

def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;

// Marks the entry into a whole wave function.
def AMDGPUwhole_wave_setup : SDNode<
"AMDGPUISD::WHOLE_WAVE_SETUP", SDTypeProfile<1, 0, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPSideEffect]>;

// Marks the return from a whole wave function.
def AMDGPUwhole_wave_return : SDNode<
"AMDGPUISD::WHOLE_WAVE_RETURN", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;

// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4160,6 +4160,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
return selectWaveAddress(I);
case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN: {
I.setDesc(TII.get(AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN));
return true;
}
case AMDGPU::G_STACKRESTORE:
return selectStackRestore(I);
case AMDGPU::G_PHI:
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5540,6 +5540,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_PREFETCH:
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
break;
case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_SETUP:
case AMDGPU::G_AMDGPU_WHOLE_WAVE_FUNC_RETURN:
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
}

return getInstructionMapping(/*ID*/1, /*Cost*/1,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3206,7 +3206,7 @@ bool GCNHazardRecognizer::fixRequiredExportPriority(MachineInstr *MI) {
// Check entry priority at each export (as there will only be a few).
// Note: amdgpu_gfx can only be a callee, so defer to caller setprio.
bool Changed = false;
if (CC != CallingConv::AMDGPU_Gfx)
if (CC != CallingConv::AMDGPU_Gfx && CC != CallingConv::AMDGPU_Gfx_WholeWave)
Changed = ensureEntrySetPrio(MF, NormalPriority, TII);

auto NextMI = std::next(It);
Expand Down
Loading