Commit 8fda219
[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic
This intrinsic is meant to be used in functions that have a "tail" that needs to run with all the lanes enabled. The "tail" may contain complex control flow that makes it unsuitable for the existing WWM intrinsics. Instead, we pretend that the function starts with all the lanes enabled, branch into the actual body of the function for the lanes that were meant to run it, and then finally have all the lanes rejoin and run the tail. As such, the intrinsic returns the EXEC mask for the body of the function, and is meant to be used only as part of a very limited pattern (for now only in amdgpu_cs_chain functions):

```
entry:
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %func_exec, label %func, label %tail

func:
  ; ... stuff that should run with the actual EXEC mask
  br label %tail

tail:
  ; ... stuff that runs with all the lanes enabled;
  ; can contain more than one basic block
```

It's an error to use the result of this intrinsic for anything other than a branch (unfortunately, checking that in the verifier is non-trivial because SIAnnotateControlFlow will introduce an amdgcn.if between the intrinsic and the branch).

Since the usage pattern is very strict, the backend can optimize away the intrinsic and the branch following it (in practice EXEC will already contain the correct value when entering the function, because it will have been set by the llvm.amdgcn.cs.chain intrinsic before jumping in). The removal is done early on, in `finalizeLowering`; however, the fact that the function conceptually started in whole wave mode is recorded in the machine function info (`hasInitWholeWave`). This will be useful in prolog/epilog insertion, where we can skip saving the inactive lanes for CSRs (since if the function started with all the lanes active, there are no inactive lanes to preserve).

Some of the generated code could still be optimized further. One important note for front-ends: for now it's recommended to avoid phis in `tail` with large structs/vectors where not all elements are modified by the body (`func` above) - prefer small phis and building the aggregate in `tail` (see the `basic` vs `phi-whole-struct` test cases).
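A minimal sketch of that recommendation, with illustrative names and types that are not taken from the commit's tests (a real shader would typically end with a call to llvm.amdgcn.cs.chain; the bare `unreachable` just keeps the sketch short):

```llvm
declare i1 @llvm.amdgcn.init.whole.wave()

define amdgpu_cs_chain void @phi_example(<8 x i32> inreg %input, i32 %x, ptr addrspace(1) %out) {
entry:
  %active = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %active, label %func, label %tail

func:
  %elt.new = add i32 %x, 42
  br label %tail

tail:
  ; Preferred: a small scalar phi; the <8 x i32> aggregate is rebuilt here
  ; instead of being carried whole through the phi.
  %elt = phi i32 [ 0, %entry ], [ %elt.new, %func ]
  %agg = insertelement <8 x i32> %input, i32 %elt, i64 3
  store <8 x i32> %agg, ptr addrspace(1) %out
  unreachable
}
```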
1 parent 2b4b909

21 files changed: +2033, -4 lines

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 14 additions & 0 deletions

```diff
@@ -208,6 +208,20 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
      [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
       IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
 
+// Sets the function into whole-wave-mode and returns whether the lane was
+// active when entering the function. A branch depending on this return will
+// revert the EXEC mask to what it was when entering the function, thus
+// resulting in a no-op. This pattern is used to optimize branches when function
+// tails need to be run in whole-wave-mode. It may also have other consequences
+// (mostly related to WWM CSR handling) that differentiate it from using
+// a plain `amdgcn.init.exec -1`.
+//
+// Can only be used in functions with the `amdgpu_cs_chain` calling convention.
+// Using this intrinsic without immediately branching on its return value is an
+// error.
+def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
+  IntrHasSideEffects, IntrNoMem, IntrNoDuplicate, IntrConvergent]>;
+
 def int_amdgcn_wavefrontsize :
   ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
```
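As a quick reference for the constraints in that comment, a minimal well-formed use might look as follows (a sketch only; the signature is illustrative, and a real amdgpu_cs_chain shader would normally end by calling llvm.amdgcn.cs.chain rather than with a bare `unreachable`):

```llvm
declare i1 @llvm.amdgcn.init.whole.wave()

define amdgpu_cs_chain void @minimal(i32 inreg %sgpr0, i32 %vgpr0) {
entry:
  ; The result may only feed the branch immediately below;
  ; any other use is an error.
  %active = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %active, label %body, label %tail

body:                                   ; lanes that were active on entry
  br label %tail

tail:                                   ; all lanes enabled
  unreachable
}
```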

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions

```diff
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     SelectDSBvhStackIntrinsic(N);
     return;
+  case Intrinsic::amdgcn_init_whole_wave:
+    CurDAG->getMachineFunction()
+        .getInfo<SIMachineFunctionInfo>()
+        ->setInitWholeWave();
+    break;
   }
 
   SelectCode(N);
```

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions

```diff
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }
 
+bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+  MFInfo->setInitWholeWave();
+  return selectImpl(MI, *CoverageInfo);
+}
+
 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectDSAppendConsume(I, true);
   case Intrinsic::amdgcn_ds_consume:
     return selectDSAppendConsume(I, false);
+  case Intrinsic::amdgcn_init_whole_wave:
+    return selectInitWholeWave(I);
   case Intrinsic::amdgcn_s_barrier:
     return selectSBarrier(I);
   case Intrinsic::amdgcn_raw_buffer_load_lds:
```

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions

```diff
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+  bool selectInitWholeWave(MachineInstr &MI) const;
   bool selectSBarrier(MachineInstr &MI) const;
   bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
```

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 5 additions & 0 deletions

```diff
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;
 
+  bool HasInitWholeWave = false;
+
 public:
   AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
 
@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return WaveLimiter;
   }
 
+  bool hasInitWholeWave() const { return HasInitWholeWave; }
+  void setInitWholeWave() { HasInitWholeWave = true; }
+
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
     return allocateLDSGlobal(DL, GV, DynLDSAlign);
   }
```

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions

```diff
@@ -4979,6 +4979,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
     break;
   }
+  case Intrinsic::amdgcn_init_whole_wave:
   case Intrinsic::amdgcn_live_mask: {
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
     break;
```

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions

```diff
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
 def : SourceOfDivergence<int_amdgcn_writelane>;
+def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
 
 foreach intr = AMDGPUMFMAIntrinsics908 in
 def : SourceOfDivergence<intr>;
```

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions

```diff
@@ -1731,6 +1731,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
                            ? DenormalMode::IEEE
                            : DenormalMode::PreserveSign;
 
+  if (YamlMFI.HasInitWholeWave)
+    MFI->setInitWholeWave();
+
   return false;
 }
```

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 4 deletions

```diff
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
 
   // Allocate spill slots for WWM reserved VGPRs.
   // For chain functions, we only need to do this if we have calls to
-  // llvm.amdgcn.cs.chain.
-  bool IsChainWithoutCalls =
-      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
-  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
+  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
+  // chain functions do not return) and the function did not contain a call to
+  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
+  // when entering the function).
+  bool IsChainWithoutRestores =
+      FuncInfo->isChainFunction() &&
+      (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
+  if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
```

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 133 additions & 0 deletions

```diff
@@ -15671,6 +15671,133 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
   }
 }
 
+static void removeInitWholeWaveBranch(MachineFunction &MF,
+                                      MachineRegisterInfo &MRI,
+                                      const SIInstrInfo *TII) {
+  // Remove SI_INIT_WHOLE_WAVE and the following SI_IF/END_CF and instead set
+  // EXEC to -1 at SI_END_CF.
+  auto IWWIt = find_if(MF.begin()->instrs(), [](const MachineInstr &MI) {
+    return MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE;
+  });
+  if (IWWIt == MF.begin()->instr_end())
+    return; // We've been here before (GISel runs finalizeLowering twice).
+
+  MachineInstr &If = *MRI.use_begin(IWWIt->getOperand(0).getReg())->getParent();
+  assert(If.getOpcode() == AMDGPU::SI_IF &&
+         "Unexpected user for init.whole.wave result");
+  assert(MRI.hasOneUse(IWWIt->getOperand(0).getReg()) &&
+         "Expected simple control flow");
+
+  MachineInstr &EndCf = *MRI.use_begin(If.getOperand(0).getReg())->getParent();
+  MachineBasicBlock *EndBB = EndCf.getParent();
+
+  // Update all the Phis: since we're removing a predecessor, we need to remove
+  // the corresponding pair of operands. However, we can't just drop the value
+  // coming from the 'if' block - that's going to be the value of the inactive
+  // lanes.
+  // %v = phi (%inactive, %if), (%active1, %shader1), ... (%activeN, %shaderN)
+  // should become
+  //   %t = phi (%active1, %shader1), ... (%activeN, %shaderN)
+  //   %v = v_set_inactive %t, %inactive
+  // Note that usually EndCf will be the first instruction after the phis and as
+  // such will serve as the end of the range when iterating over phis.
+  // Therefore, we shouldn't introduce any new instructions before it.
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  auto AfterEndCf = std::next(EndCf.getIterator());
+  for (auto &Phi : EndBB->phis()) {
+    Register PhiDest = Phi.getOperand(0).getReg();
+    const TargetRegisterClass *PhiRC = MRI.getRegClass(PhiDest);
+
+    Register NewPhiDest = MRI.createVirtualRegister(PhiRC);
+    Phi.getOperand(0).setReg(NewPhiDest);
+
+    unsigned InactiveOpIdx = 0;
+    for (unsigned I = 1; I < Phi.getNumOperands(); I += 2) {
+      if (Phi.getOperand(I + 1).getMBB() == If.getParent()) {
+        InactiveOpIdx = I;
+        break;
+      }
+    }
+    assert(InactiveOpIdx != 0 && "Broken phi?");
+
+    // At this point, the register class could be larger than 32 or 64, so we
+    // might have to use more than one V_SET_INACTIVE instruction.
+    unsigned Size = TRI.getRegSizeInBits(*PhiRC);
+    switch (Size) {
+    case 32:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B32), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    case 64:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B64), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    default: {
+      // For each 32-bit subregister of the register at InactiveOpIdx, insert
+      // a COPY to a new register, and a V_SET_INACTIVE_B32 using the
+      // corresponding subregisters of PhiDest and NewPhiDest.
+      // FIXME: There has to be a better way to iterate over this...
+      llvm::SmallVector<Register, 16> PhiSubRegs;
+      const unsigned SubRegIndices[] = {
+          AMDGPU::sub0,  AMDGPU::sub1,  AMDGPU::sub2,  AMDGPU::sub3,
+          AMDGPU::sub4,  AMDGPU::sub5,  AMDGPU::sub6,  AMDGPU::sub7,
+          AMDGPU::sub8,  AMDGPU::sub9,  AMDGPU::sub10, AMDGPU::sub11,
+          AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+          AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+          AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+          AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+          AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31};
+      const unsigned NumSubRegs = Size / 32;
+      assert(sizeof(SubRegIndices) / sizeof(SubRegIndices[0]) >= NumSubRegs &&
+             "Not enough subregister indices");
+      for (unsigned I = 0; I != NumSubRegs; ++I) {
+        unsigned SubRegIdx = SubRegIndices[I];
+        Register InactiveSubReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(), TII->get(AMDGPU::COPY),
+                InactiveSubReg)
+            .addReg(Phi.getOperand(InactiveOpIdx).getReg(), 0, SubRegIdx);
+
+        Register AllLanesSubReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+                TII->get(AMDGPU::V_SET_INACTIVE_B32), AllLanesSubReg)
+            .addReg(NewPhiDest, 0, SubRegIdx)
+            .addReg(InactiveSubReg);
+        PhiSubRegs.push_back(AllLanesSubReg);
+      }
+      // Now we need to combine the subregisters into the original register.
+      auto RegSequence = BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+                                 TII->get(AMDGPU::REG_SEQUENCE), PhiDest);
+      for (unsigned I = 0; I < NumSubRegs; ++I) {
+        RegSequence.addReg(PhiSubRegs[I]);
+        RegSequence.addImm(SubRegIndices[I]);
+      }
+      break;
+    }
+    }
+
+    Phi.removeOperand(InactiveOpIdx + 1);
+    Phi.removeOperand(InactiveOpIdx);
+  }
+  If.getParent()->removeSuccessor(EndBB);
+
+  BuildMI(*EndBB, AfterEndCf, IWWIt->getDebugLoc(),
+          TII->get(MF.getSubtarget<GCNSubtarget>().isWave32()
+                       ? AMDGPU::S_MOV_B32
+                       : AMDGPU::S_MOV_B64),
+          TII->getRegisterInfo().getExec())
+      .addImm(-1);
+
+  EndCf.eraseFromParent();
+  If.eraseFromParent();
+  IWWIt->eraseFromParent();
+}
+
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -15681,6 +15808,12 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
+  if (Info->hasInitWholeWave()) {
+    assert(Info->isChainFunction() &&
+           "init.whole.wave may only be used in chain functions");
+    removeInitWholeWaveBranch(MF, MRI, TII);
+  }
+
   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.
     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
```
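To make the phi rewrite above concrete, here is a hedged pseudo-MIR sketch of the `default` (wide register) path for a 96-bit phi in the join block; register and block names are illustrative, and implicit operands are omitted:

```
; Before: %bb.0 (where SI_IF lived) is still a predecessor of the join block.
%v:vreg_96 = PHI %inactive:vreg_96, %bb.0, %active:vreg_96, %bb.1
SI_END_CF %mask

; After: the %bb.0 incoming value is merged one 32-bit lane at a time, and
; EXEC is set to all lanes where SI_END_CF used to be (S_MOV_B32 $exec_lo
; on wave32).
%t:vreg_96 = PHI %active:vreg_96, %bb.1
%i0:vgpr_32 = COPY %inactive.sub0
%m0:vgpr_32 = V_SET_INACTIVE_B32 %t.sub0, %i0
%i1:vgpr_32 = COPY %inactive.sub1
%m1:vgpr_32 = V_SET_INACTIVE_B32 %t.sub1, %i1
%i2:vgpr_32 = COPY %inactive.sub2
%m2:vgpr_32 = V_SET_INACTIVE_B32 %t.sub2, %i2
%v:vreg_96 = REG_SEQUENCE %m0, %subreg.sub0, %m1, %subreg.sub1, %m2, %subreg.sub2
$exec = S_MOV_B64 -1
```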

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 8 additions & 0 deletions

```diff
@@ -575,6 +575,14 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   let Defs = [EXEC];
 }
 
+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
+  (outs SReg_1:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
+  let Defs = [EXEC];
+  let Uses = [EXEC];
+}
+
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
```

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions

```diff
@@ -289,6 +289,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   StringValue SGPRForEXECCopy;
   StringValue LongBranchReservedReg;
 
+  bool HasInitWholeWave = false;
+
   SIMachineFunctionInfo() = default;
   SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
                         const TargetRegisterInfo &TRI,
@@ -336,6 +338,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                        StringValue());
+    YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
   }
 };
```
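With the mapping above, the flag round-trips through MIR serialization. A minimal sketch of how it might appear in a .mir test (the function name and any surrounding fields are illustrative):

```
name:            chain_fn
machineFunctionInfo:
  hasInitWholeWave: true
```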
