@@ -961,12 +961,18 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
 }
 
 static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
-                              bool IsTailCall, CallingConv::ID CC) {
-  assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
-                                        "because the address can be divergent");
+                              bool IsTailCall, bool isWave32,
+                              CallingConv::ID CC) {
+  // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
+  assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
+         "Indirect calls can't be tail calls, "
+         "because the address can be divergent");
   if (!IsTailCall)
     return AMDGPU::G_SI_CALL;
 
+  if (AMDGPU::isChainCC(CC))
+    return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
+
   return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
                                          AMDGPU::SI_TCRETURN;
 }
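
For reference, the opcode selection above can be restated as a small standalone sketch. This is illustrative only and not part of the patch: the enum values and the helper name pickCallOpcode are invented here, and the real getCallOpcode also receives the MachineFunction and the calling-convention enum rather than booleans.

// Illustrative sketch only: mirrors the selection logic in getCallOpcode.
enum class Opcode {
  G_SI_CALL,           // ordinary (non-tail) call
  SI_CS_CHAIN_TC_W32,  // tail call to an amdgpu_cs_chain callee, wave32
  SI_CS_CHAIN_TC_W64,  // tail call to an amdgpu_cs_chain callee, wave64
  SI_TCRETURN_GFX,     // tail call, amdgpu_gfx calling convention
  SI_TCRETURN          // tail call, other calling conventions
};

static Opcode pickCallOpcode(bool IsTailCall, bool IsChainCC, bool IsGfxCC,
                             bool IsWave32) {
  if (!IsTailCall)
    return Opcode::G_SI_CALL;
  if (IsChainCC)
    return IsWave32 ? Opcode::SI_CS_CHAIN_TC_W32 : Opcode::SI_CS_CHAIN_TC_W64;
  return IsGfxCC ? Opcode::SI_TCRETURN_GFX : Opcode::SI_TCRETURN;
}
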
@@ -1154,14 +1160,20 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
 void AMDGPUCallLowering::handleImplicitCallArguments(
     MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
     const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
+    CallingConv::ID CalleeCC,
     ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
   if (!ST.enableFlatScratch()) {
     // Insert copies for the SRD. In the HSA case, this should be an identity
     // copy.
     auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
                                                FuncInfo.getScratchRSrcReg());
-    MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-    CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+
+    auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
+                             ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+                             : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+    MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
+    CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
   }
 
   for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
@@ -1193,7 +1205,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!IsSibCall)
     CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
 
-  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
+  unsigned Opc =
+      getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   if (!addCallTargetOperands(MIB, MIRBuilder, Info))
     return false;
@@ -1202,8 +1215,27 @@ bool AMDGPUCallLowering::lowerTailCall(
   // be 0.
   MIB.addImm(0);
 
-  // Tell the call which registers are clobbered.
+  // If this is a chain call, we need to pass in the EXEC mask.
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  if (AMDGPU::isChainCC(Info.CallConv)) {
+    ArgInfo ExecArg = Info.OrigArgs[1];
+    assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
+
+    if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
+      return false;
+
+    if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
+      MIB.addImm(CI->getSExtValue());
+    } else {
+      MIB.addReg(ExecArg.Regs[0]);
+      unsigned Idx = MIB->getNumOperands() - 1;
+      MIB->getOperand(Idx).setReg(constrainOperandRegClass(
+          MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
+          MIB->getDesc(), MIB->getOperand(Idx), Idx));
+    }
+  }
+
+  // Tell the call which registers are clobbered.
   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
   MIB.addRegMask(Mask);
 
@@ -1257,7 +1289,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   // after the ordinary user argument registers.
   SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
 
-  if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+  if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+      !AMDGPU::isChainCC(Info.CallConv)) {
     // With a fixed ABI, allocate fixed registers before user arguments.
     if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
       return false;
@@ -1273,7 +1306,8 @@ bool AMDGPUCallLowering::lowerTailCall(
   if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
     return false;
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
+                              ImplicitArgRegs);
 
   // If we have -tailcallopt, we need to adjust the stack. We'll do the call
   // sequence start and end here.
@@ -1307,8 +1341,62 @@ bool AMDGPUCallLowering::lowerTailCall(
   return true;
 }
 
+/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
+bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
+                                        CallLoweringInfo &Info) const {
+  ArgInfo Callee = Info.OrigArgs[0];
+  ArgInfo SGPRArgs = Info.OrigArgs[2];
+  ArgInfo VGPRArgs = Info.OrigArgs[3];
+  ArgInfo Flags = Info.OrigArgs[4];
+
+  assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
+         "Non-zero flags aren't supported yet.");
+  assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = MF.getFunction();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  // The function to jump to is actually the first argument, so we'll change the
+  // Callee and other info to match that before using our existing helper.
+  const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
+  if (const Function *F = dyn_cast<Function>(CalleeV)) {
+    Info.Callee = MachineOperand::CreateGA(F, 0);
+    Info.CallConv = F->getCallingConv();
+  } else {
+    assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
+    Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
+    Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
+                                                  // behaves the same here.
+  }
+
+  // The function that we're calling cannot be vararg (only the intrinsic is).
+  Info.IsVarArg = false;
+
+  assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
+                     [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "SGPR arguments should be marked inreg");
+  assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
+                      [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+         "VGPR arguments should not be marked inreg");
+
+  SmallVector<ArgInfo, 8> OutArgs;
+  splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
+  splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
+
+  Info.IsMustTailCall = true;
+  return lowerTailCall(MIRBuilder, Info, OutArgs);
+}
+
 bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
                                    CallLoweringInfo &Info) const {
+  if (Function *F = Info.CB->getCalledFunction())
+    if (F->isIntrinsic()) {
+      assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
+             "Unexpected intrinsic");
+      return lowerChainCall(MIRBuilder, Info);
+    }
+
   if (Info.IsVarArg) {
     LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
     return false;
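
As a rough model of the flow added above: lowerChainCall unpacks the intrinsic's operands (callee, EXEC mask, SGPR args, VGPR args, flags), rewrites Info so the callee becomes the call target, concatenates the SGPR and VGPR arguments as the outgoing args, and defers to lowerTailCall as a must-tail call, with the EXEC mask handled separately as an operand of the SI_CS_CHAIN_TC_* instruction. The sketch below restates that repacking with simplified stand-in types; the struct and function names are invented for illustration and carry none of LLVM's real types.

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-ins for the relevant pieces of CallLoweringInfo (illustrative only).
struct ChainCallOperands {
  int Callee;                 // OrigArgs[0]: the function to jump to
  uint64_t ExecMask;          // OrigArgs[1]: i32 or i64, depending on wave size
  std::vector<int> SGPRArgs;  // OrigArgs[2]: must be marked inreg
  std::vector<int> VGPRArgs;  // OrigArgs[3]: must not be inreg
  uint32_t Flags;             // OrigArgs[4]: required to be zero for now
};

struct TailCallRequest {
  int Callee;
  bool IsMustTailCall = false;
  std::vector<int> OutArgs;   // SGPR args followed by VGPR args
};

// Mirrors the repacking lowerChainCall performs before calling lowerTailCall.
TailCallRequest repackChainCall(const ChainCallOperands &In) {
  assert(In.Flags == 0 && "Non-zero flags aren't supported yet.");
  TailCallRequest Out;
  Out.Callee = In.Callee;
  Out.OutArgs = In.SGPRArgs;
  Out.OutArgs.insert(Out.OutArgs.end(), In.VGPRArgs.begin(), In.VGPRArgs.end());
  Out.IsMustTailCall = true;
  return Out;
}
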
@@ -1357,7 +1445,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   // Create a temporarily-floating call instruction so we can add the implicit
   // uses of arg registers.
-  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
+  unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
+                               Info.CallConv);
 
   auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
   MIB.addDef(TRI->getReturnAddressReg(MF));
@@ -1399,7 +1488,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
+  handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
+                              ImplicitArgRegs);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getStackSize();