Skip to content

Commit 44556e6

Browse files
authored
[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic (#105822)
This intrinsic is meant to be used in functions that have a "tail" that needs to be run with all the lanes enabled. The "tail" may contain complex control flow that makes it unsuitable for the use of the existing WWM intrinsics. Instead, we will pretend that the function starts with all the lanes enabled, then branches into the actual body of the function for the lanes that were meant to run it, and then finally all the lanes will rejoin and run the tail.

As such, the intrinsic will return the EXEC mask for the body of the function, and is meant to be used only as part of a very limited pattern (for now only in amdgpu_cs_chain functions):

```
entry:
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %func_exec, label %func, label %tail

func:
  ; ... stuff that should run with the actual EXEC mask
  br label %tail

tail:
  ; ... stuff that runs with all the lanes enabled;
  ; can contain more than one basic block
```

It's an error to use the result of this intrinsic for anything other than a branch (but unfortunately checking that in the verifier is non-trivial because SIAnnotateControlFlow will introduce an amdgcn.if between the intrinsic and the branch).

The intrinsic is lowered to a SI_INIT_WHOLE_WAVE pseudo, which for now is expanded in si-wqm (which is where SI_INIT_EXEC is handled too); however, the information that the function was conceptually started in whole wave mode is stored in the machine function info (hasInitWholeWave). This will be useful in prolog epilog insertion, where we can skip saving the inactive lanes for CSRs (since if the function started with all the lanes active, then there are no inactive lanes to preserve).
1 parent 925b220 commit 44556e6

22 files changed

+1524
-5
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,16 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
208208
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
209209
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
210210

211+
// Sets the function into whole-wave-mode and returns whether the lane was
212+
// active when entering the function. A branch depending on this return will
213+
// revert the EXEC mask to what it was when entering the function, thus
214+
// resulting in a no-op. This pattern is used to optimize branches when function
215+
// tails need to be run in whole-wave-mode. It may also have other consequences
216+
// (mostly related to WWM CSR handling) that differentiate it from using
217+
// a plain `amdgcn.init.exec -1`.
218+
//
// Usage constraints (per the commit that introduced the intrinsic): the i1
// result is only meant to feed the conditional branch that separates the
// function body from the whole-wave tail, and for now the intrinsic is only
// intended for amdgpu_cs_chain functions. This is not checked by the IR
// verifier.
def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
219+
IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
220+
211221
def int_amdgcn_wavefrontsize :
212222
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
213223
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
27382738
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
27392739
SelectDSBvhStackIntrinsic(N);
27402740
return;
2741+
case Intrinsic::amdgcn_init_whole_wave:
2742+
CurDAG->getMachineFunction()
2743+
.getInfo<SIMachineFunctionInfo>()
2744+
->setInitWholeWave();
2745+
break;
27412746
}
27422747

27432748
SelectCode(N);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
17721772
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
17731773
}
17741774

1775+
/// Select llvm.amdgcn.init.whole.wave.
///
/// Marks the enclosing machine function as having started in whole wave mode
/// (SIMachineFunctionInfo::setInitWholeWave) and then defers the actual
/// instruction selection to the TableGen-generated matcher.
///
/// \returns the result of selectImpl for \p MI.
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
  // Walk instruction -> basic block -> function to reach the function info,
  // without introducing a local that shadows the selector's own MF.
  MI.getParent()
      ->getParent()
      ->getInfo<SIMachineFunctionInfo>()
      ->setInitWholeWave();

  return selectImpl(MI, *CoverageInfo);
}
1782+
17751783
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
17761784
if (TM.getOptLevel() > CodeGenOptLevel::None) {
17771785
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
20992107
return selectDSAppendConsume(I, true);
21002108
case Intrinsic::amdgcn_ds_consume:
21012109
return selectDSAppendConsume(I, false);
2110+
case Intrinsic::amdgcn_init_whole_wave:
2111+
return selectInitWholeWave(I);
21022112
case Intrinsic::amdgcn_s_barrier:
21032113
return selectSBarrier(I);
21042114
case Intrinsic::amdgcn_raw_buffer_load_lds:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
120120
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
121121
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
122122
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
123+
bool selectInitWholeWave(MachineInstr &MI) const;
123124
bool selectSBarrier(MachineInstr &MI) const;
124125
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
125126

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
6767
// Kernel may need limited waves per EU for better performance.
6868
bool WaveLimiter = false;
6969

70+
bool HasInitWholeWave = false;
71+
7072
public:
7173
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
7274

@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
109111
return WaveLimiter;
110112
}
111113

114+
/// \returns true once setInitWholeWave() has been called for this function,
/// i.e. the function was marked as using llvm.amdgcn.init.whole.wave.
bool hasInitWholeWave() const {
  return HasInitWholeWave;
}
115+
/// Marks this function as using llvm.amdgcn.init.whole.wave. The flag is only
/// ever raised here; this setter takes no argument and cannot clear it.
void setInitWholeWave() {
  HasInitWholeWave = true;
}
116+
112117
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
113118
return allocateLDSGlobal(DL, GV, DynLDSAlign);
114119
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4997,6 +4997,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49974997
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
49984998
break;
49994999
}
5000+
case Intrinsic::amdgcn_init_whole_wave:
50005001
case Intrinsic::amdgcn_live_mask: {
50015002
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
50025003
break;

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
329329
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
330330
def : SourceOfDivergence<int_amdgcn_update_dpp>;
331331
def : SourceOfDivergence<int_amdgcn_writelane>;
332+
def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
332333

333334
foreach intr = AMDGPUMFMAIntrinsics908 in
334335
def : SourceOfDivergence<intr>;

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1739,6 +1739,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
17391739
? DenormalMode::IEEE
17401740
: DenormalMode::PreserveSign;
17411741

1742+
if (YamlMFI.HasInitWholeWave)
1743+
MFI->setInitWholeWave();
1744+
17421745
return false;
17431746
}
17441747

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
13431343

13441344
// Allocate spill slots for WWM reserved VGPRs.
13451345
// For chain functions, we only need to do this if we have calls to
1346-
// llvm.amdgcn.cs.chain.
1347-
bool IsChainWithoutCalls =
1348-
FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1349-
if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1346+
// llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
1347+
// chain functions do not return) and the function did not contain a call to
1348+
// llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
1349+
// when entering the function).
1350+
bool IsChainWithoutRestores =
1351+
FuncInfo->isChainFunction() &&
1352+
(!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
1353+
if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
13501354
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
13511355
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
13521356
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,16 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
583583
let Defs = [EXEC];
584584
}
585585

586+
// Sets EXEC to all lanes and returns the previous EXEC.
// Selected directly from llvm.amdgcn.init.whole.wave through the pattern
// below. The pseudo is expanded later by SIWholeQuadMode::lowerInitExec into
// an S_OR_SAVEEXEC_B32/B64 with an immediate of -1 at the top of the entry
// block.
587+
def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
588+
(outs SReg_1:$dst), (ins),
589+
[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
590+
// EXEC is both overwritten (all lanes enabled) and read (the previous mask
// is the value produced in $dst).
let Defs = [EXEC];
591+
let Uses = [EXEC];
592+
593+
let isConvergent = 1;
594+
}
595+
586596
// Return for returning shaders to a shader variant epilog.
587597
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
588598
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
289289
StringValue SGPRForEXECCopy;
290290
StringValue LongBranchReservedReg;
291291

292+
bool HasInitWholeWave = false;
293+
292294
SIMachineFunctionInfo() = default;
293295
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
294296
const TargetRegisterInfo &TRI,
@@ -336,6 +338,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
336338
StringValue()); // Don't print out when it's empty.
337339
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
338340
StringValue());
341+
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
339342
}
340343
};
341344

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
594594
KillInstrs.push_back(&MI);
595595
BBI.NeedsLowering = true;
596596
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
597-
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
597+
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
598+
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
598599
InitExecInstrs.push_back(&MI);
599600
} else if (WQMOutputs) {
600601
// The function is in machine SSA form, which means that physical
@@ -1582,6 +1583,29 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
15821583
MachineBasicBlock *MBB = MI.getParent();
15831584
bool IsWave32 = ST->isWave32();
15841585

1586+
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
  // Lower SI_INIT_WHOLE_WAVE: enable every lane and give the users of the
  // pseudo's result the EXEC mask the function was entered with.
  assert(MBB == &MBB->getParent()->front() &&
         "init whole wave not in entry block");
  Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());

  // The save-exec is placed at the very beginning of the entry block (not at
  // MI's position) so that the entry EXEC is captured before anything else
  // in the block runs.
  MachineInstr *SaveExec =
      BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
              TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
                                : AMDGPU::S_OR_SAVEEXEC_B64),
              EntryExec)
          .addImm(-1);

  // Replace all uses of MI's destination reg with EntryExec.
  MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

  // MI must be dropped from the LiveIntervals/SlotIndexes maps while it is
  // still alive: eraseFromParent() destroys the instruction, so touching MI
  // afterwards (as the previous ordering did) is a use-after-free.
  if (LIS)
    LIS->RemoveMachineInstrFromMaps(MI);

  MI.eraseFromParent();

  if (LIS) {
    LIS->InsertMachineInstrInMaps(*SaveExec);
    LIS->createAndComputeVirtRegInterval(EntryExec);
  }
  return;
}
15851609
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
15861610
// This should be before all vector instructions.
15871611
MachineInstr *InitMI =

0 commit comments

Comments
 (0)