Commit 7f5d59b

[AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic (#68186)
The @llvm.amdgcn.cs.chain intrinsic is essentially a call. The call parameters are bundled into two intrinsic arguments: one for those that should go in the SGPRs (the 3rd intrinsic argument) and one for those that should go in the VGPRs (the 4th intrinsic argument). Both will often be some kind of aggregate.

Both instruction selection frameworks have an internal representation for intrinsics (G_INTRINSIC[_W_SIDE_EFFECTS] for GlobalISel, ISD::INTRINSIC_[VOID|W_CHAIN] for DAGISel), but we can't use those here because aggregates are dissolved very early during ISel and we'd lose the inreg information. Therefore, this patch short-circuits both the IRTranslator and the SelectionDAGBuilder to lower this intrinsic as a call from the very start. It reuses the existing infrastructure as much as possible by calling into the code for lowering tail calls.

This has already gone through a few rounds of review in Phabricator:
Differential Revision: https://reviews.llvm.org/D153761
1 parent 838331a commit 7f5d59b
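For context, here is a minimal sketch of a call to the intrinsic in IR, assuming wave32 and a hypothetical callee; the aggregate types are illustrative only. The five arguments are the callee, the EXEC mask, the inreg SGPR aggregate, the VGPR aggregate, and the flags (which must currently be zero):

  ; Hypothetical callee; argument types chosen only to show the shape.
  declare amdgpu_cs_chain void @callee(<3 x i32> inreg, { i32, float })

  define amdgpu_cs_chain void @caller(<3 x i32> inreg %sgpr, { i32, float } %vgpr) {
    ; i32 -1 enables all 32 lanes of EXEC; the trailing i32 0 is the flags.
    call void (ptr, i32, <3 x i32>, { i32, float }, i32, ...)
        @llvm.amdgcn.cs.chain(ptr @callee, i32 -1, <3 x i32> inreg %sgpr,
                              { i32, float } %vgpr, i32 0)
    unreachable
  }

The inreg on the SGPR aggregate is exactly the information that would be lost if the aggregate were dissolved into an ordinary intrinsic node, which is why the changes below lower the intrinsic as a call immediately.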

16 files changed: +2736 −91 lines

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 3 additions & 0 deletions
@@ -62,6 +62,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/PatternMatch.h"
@@ -2390,6 +2391,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
     return CLI->lowerCall(MIRBuilder, Info);
   }
+  case Intrinsic::amdgcn_cs_chain:
+    return translateCallBase(CI, MIRBuilder);
   case Intrinsic::fptrunc_round: {
     uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 49 additions & 0 deletions
@@ -76,6 +76,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -7444,6 +7445,54 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, Val);
     return;
   }
+  case Intrinsic::amdgcn_cs_chain: {
+    assert(I.arg_size() == 5 && "Additional args not supported yet");
+    assert(cast<ConstantInt>(I.getOperand(4))->isZero() &&
+           "Non-zero flags not supported yet");
+
+    // At this point we don't care if it's amdgpu_cs_chain or
+    // amdgpu_cs_chain_preserve.
+    CallingConv::ID CC = CallingConv::AMDGPU_CS_Chain;
+
+    Type *RetTy = I.getType();
+    assert(RetTy->isVoidTy() && "Should not return");
+
+    SDValue Callee = getValue(I.getOperand(0));
+
+    // We only have 2 actual args: one for the SGPRs and one for the VGPRs.
+    // We'll also tack the value of the EXEC mask at the end.
+    TargetLowering::ArgListTy Args;
+    Args.reserve(3);
+
+    for (unsigned Idx : {2, 3, 1}) {
+      TargetLowering::ArgListEntry Arg;
+      Arg.Node = getValue(I.getOperand(Idx));
+      Arg.Ty = I.getOperand(Idx)->getType();
+      Arg.setAttributes(&I, Idx);
+      Args.push_back(Arg);
+    }
+
+    assert(Args[0].IsInReg && "SGPR args should be marked inreg");
+    assert(!Args[1].IsInReg && "VGPR args should not be marked inreg");
+    Args[2].IsInReg = true; // EXEC should be inreg
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(getCurSDLoc())
+        .setChain(getRoot())
+        .setCallee(CC, RetTy, Callee, std::move(Args))
+        .setNoReturn(true)
+        .setTailCall(true)
+        .setConvergent(I.isConvergent());
+    CLI.CB = &I;
+    std::pair<SDValue, SDValue> Result =
+        lowerInvokable(CLI, /*EHPadBB*/ nullptr);
+    (void)Result;
+    assert(!Result.first.getNode() && !Result.second.getNode() &&
+           "Should've lowered as tail call");
+
+    HasTailCall = true;
+    return;
+  }
   case Intrinsic::ptrmask: {
     SDValue Ptr = getValue(I.getOperand(0));
     SDValue Mask = getValue(I.getOperand(1));

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 101 additions & 11 deletions
@@ -961,12 +961,18 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
 }
 
 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
-                              bool IsTailCall, CallingConv::ID CC) {
-  assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
-                                        "because the address can be divergent");
+                              bool IsTailCall, bool isWave32,
+                              CallingConv::ID CC) {
+  // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
+  assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
+         "Indirect calls can't be tail calls, "
+         "because the address can be divergent");
   if (!IsTailCall)
     return AMDGPU::G_SI_CALL;
 
+  if (AMDGPU::isChainCC(CC))
+    return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
+
   return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
                                          AMDGPU::SI_TCRETURN;
 }
@@ -1154,14 +1160,20 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
 void AMDGPUCallLowering::handleImplicitCallArguments(
     MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
     const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
+    CallingConv::ID CalleeCC,
    ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
   if (!ST.enableFlatScratch()) {
     // Insert copies for the SRD. In the HSA case, this should be an identity
     // copy.
     auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
                                                FuncInfo.getScratchRSrcReg());
-    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-    CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+
+    auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
+                             ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+                             : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+    MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
+    CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
   }
 
   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
@@ -1193,7 +1205,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!IsSibCall)
     CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
 
-  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
+  unsigned Opc =
+      getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
     return false;
@@ -1202,8 +1215,27 @@
   // be 0.
   MIB.addImm(0);
 
-  // Tell the call which registers are clobbered.
+  // If this is a chain call, we need to pass in the EXEC mask.
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (AMDGPU::isChainCC(Info.CallConv)) {
+    ArgInfo ExecArg = Info.OrigArgs[1];
+    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
+
+    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
+      return false;
+
+    if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
+      MIB.addImm(CI->getSExtValue());
+    } else {
+      MIB.addReg(ExecArg.Regs[0]);
+      unsigned Idx = MIB->getNumOperands() - 1;
+      MIB->getOperand(Idx).setReg(constrainOperandRegClass(
+          MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
+          MIB->getDesc(), MIB->getOperand(Idx), Idx));
+    }
+  }
+
+  // Tell the call which registers are clobbered.
   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
   MIB.addRegMask(Mask);
 
@@ -1257,7 +1289,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   // after the ordinary user argument registers.
   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
 
-  if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+      !AMDGPU::isChainCC(Info.CallConv)) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
       return false;
@@ -1273,7 +1306,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
+                              ImplicitArgRegs);
 
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   // sequence start and end here.
@@ -1307,8 +1341,62 @@ bool AMDGPUCallLowering::lowerTailCall(
   return true;
 }
 
+/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
+bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
+                                        CallLoweringInfo &Info) const {
+  ArgInfo Callee = Info.OrigArgs[0];
+  ArgInfo SGPRArgs = Info.OrigArgs[2];
+  ArgInfo VGPRArgs = Info.OrigArgs[3];
+  ArgInfo Flags = Info.OrigArgs[4];
+
+  assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
+         "Non-zero flags aren't supported yet.");
+  assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = MF.getFunction();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // The function to jump to is actually the first argument, so we'll change the
+  // Callee and other info to match that before using our existing helper.
+  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
+  if (const Function *F = dyn_cast<Function>(CalleeV)) {
+    Info.Callee = MachineOperand::CreateGA(F, 0);
+    Info.CallConv = F->getCallingConv();
+  } else {
+    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
+    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
+    Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
+                                                  // behaves the same here.
+  }
+
+  // The function that we're calling cannot be vararg (only the intrinsic is).
+  Info.IsVarArg = false;
+
+  assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
+                     [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "SGPR arguments should be marked inreg");
+  assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
+                      [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "VGPR arguments should not be marked inreg");
+
+  SmallVector<ArgInfo, 8> OutArgs;
+  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
+  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
+
+  Info.IsMustTailCall = true;
+  return lowerTailCall(MIRBuilder, Info, OutArgs);
+}
+
 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                    CallLoweringInfo &Info) const {
+  if (Function *F = Info.CB->getCalledFunction())
+    if (F->isIntrinsic()) {
+      assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
+             "Unexpected intrinsic");
+      return lowerChainCall(MIRBuilder, Info);
+    }
+
   if (Info.IsVarArg) {
     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
     return false;
@@ -1357,7 +1445,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
-  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
+  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
+                               Info.CallConv);
 
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   MIB.addDef(TRI->getReturnAddressReg(MF));
@@ -1399,7 +1488,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
+                              ImplicitArgRegs);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getStackSize();

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h

Lines changed: 3 additions & 0 deletions
@@ -75,10 +75,13 @@ class AMDGPUCallLowering final : public CallLowering {
   void handleImplicitCallArguments(
       MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
       const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
+      CallingConv::ID CalleeCC,
       ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;
 
   bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
                      SmallVectorImpl<ArgInfo> &OutArgs) const;
+  bool lowerChainCall(MachineIRBuilder &MIRBuilder,
+                      CallLoweringInfo &Info) const;
   bool lowerCall(MachineIRBuilder &MIRBuilder,
                  CallLoweringInfo &Info) const override;

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 1 addition & 0 deletions
@@ -5212,6 +5212,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CALL)
   NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TC_RETURN_GFX)
+  NODE_NAME_CASE(TC_RETURN_CHAIN)
   NODE_NAME_CASE(TRAP)
   NODE_NAME_CASE(RET_GLUE)
   NODE_NAME_CASE(WAVE_ADDRESS)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -389,6 +389,7 @@ enum NodeType : unsigned {
   CALL,
   TC_RETURN,
   TC_RETURN_GFX,
+  TC_RETURN_CHAIN,
   TRAP,
 
   // Masked control flow nodes.

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 5 additions & 0 deletions
@@ -94,6 +94,11 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
 
+def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
+  SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
   SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
   [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
