Skip to content

Commit 3356208

Browse files
authored
Reland "[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic" (#108512)
This reverts commit 7792b4a. The problem was a conflict with e55d6f5 "[AMDGPU] Simplify and improve codegen for llvm.amdgcn.set.inactive (#107889)" which changed the syntax of V_SET_INACTIVE (and thus made my MIR test crash). ...if only we had a merge queue.
1 parent d4f6ad5 commit 3356208

22 files changed

+1530
-5
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,16 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
208208
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
209209
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
210210

211+
// Sets the function into whole-wave-mode and returns whether the lane was
212+
// active when entering the function. A branch depending on this return will
213+
// revert the EXEC mask to what it was when entering the function, thus
214+
// resulting in a no-op. This pattern is used to optimize branches when function
215+
// tails need to be run in whole-wave-mode. It may also have other consequences
216+
// (mostly related to WWM CSR handling) that differentiate it from using
217+
// a plain `amdgcn.init.exec -1`.
218+
// Signature: takes no arguments and produces a single i1 result. Marked as
// having side effects, not accessing memory, and convergent.
def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
219+
IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
220+
211221
def int_amdgcn_wavefrontsize :
212222
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
213223
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
27382738
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
27392739
SelectDSBvhStackIntrinsic(N);
27402740
return;
2741+
case Intrinsic::amdgcn_init_whole_wave:
2742+
CurDAG->getMachineFunction()
2743+
.getInfo<SIMachineFunctionInfo>()
2744+
->setInitWholeWave();
2745+
break;
27412746
}
27422747

27432748
SelectCode(N);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
17721772
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
17731773
}
17741774

1775+
// Selects an llvm.amdgcn.init.whole.wave intrinsic instruction.
//
// Records on the enclosing function's SIMachineFunctionInfo that the intrinsic
// is present (via setInitWholeWave()), then delegates the actual selection of
// MI to selectImpl and returns its result unchanged.
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1776+
MachineFunction *MF = MI.getParent()->getParent();
1777+
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1778+
1779+
// Set the per-function flag before selecting; it is set regardless of
// whether selectImpl below succeeds.
MFInfo->setInitWholeWave();
1780+
return selectImpl(MI, *CoverageInfo);
1781+
}
1782+
17751783
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
17761784
if (TM.getOptLevel() > CodeGenOptLevel::None) {
17771785
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
20992107
return selectDSAppendConsume(I, true);
21002108
case Intrinsic::amdgcn_ds_consume:
21012109
return selectDSAppendConsume(I, false);
2110+
case Intrinsic::amdgcn_init_whole_wave:
2111+
return selectInitWholeWave(I);
21022112
case Intrinsic::amdgcn_s_barrier:
21032113
return selectSBarrier(I);
21042114
case Intrinsic::amdgcn_raw_buffer_load_lds:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
120120
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
121121
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
122122
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
123+
bool selectInitWholeWave(MachineInstr &MI) const;
123124
bool selectSBarrier(MachineInstr &MI) const;
124125
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
125126

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
6767
// Kernel may need limited waves per EU for better performance.
6868
bool WaveLimiter = false;
6969

70+
bool HasInitWholeWave = false;
71+
7072
public:
7173
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
7274

@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
109111
return WaveLimiter;
110112
}
111113

114+
// Reports whether the HasInitWholeWave flag is set for this function.
bool hasInitWholeWave() const { return HasInitWholeWave; }
115+
// Sets the HasInitWholeWave flag. This setter only ever writes true; it takes
// no argument and offers no way to clear the flag.
void setInitWholeWave() { HasInitWholeWave = true; }
116+
112117
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
113118
return allocateLDSGlobal(DL, GV, DynLDSAlign);
114119
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4997,6 +4997,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49974997
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
49984998
break;
49994999
}
5000+
case Intrinsic::amdgcn_init_whole_wave:
50005001
case Intrinsic::amdgcn_live_mask: {
50015002
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
50025003
break;

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
329329
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
330330
def : SourceOfDivergence<int_amdgcn_update_dpp>;
331331
def : SourceOfDivergence<int_amdgcn_writelane>;
332+
def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
332333

333334
foreach intr = AMDGPUMFMAIntrinsics908 in
334335
def : SourceOfDivergence<intr>;

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1740,6 +1740,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
17401740
? DenormalMode::IEEE
17411741
: DenormalMode::PreserveSign;
17421742

1743+
if (YamlMFI.HasInitWholeWave)
1744+
MFI->setInitWholeWave();
1745+
17431746
return false;
17441747
}
17451748

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
13431343

13441344
// Allocate spill slots for WWM reserved VGPRs.
13451345
// For chain functions, we only need to do this if we have calls to
1346-
// llvm.amdgcn.cs.chain.
1347-
bool IsChainWithoutCalls =
1348-
FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1349-
if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1346+
// llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
1347+
// chain functions do not return) and the function does not contain a call to
1348+
// llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
1349+
// when entering the function).
1350+
bool IsChainWithoutRestores =
1351+
FuncInfo->isChainFunction() &&
1352+
(!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
1353+
if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
13501354
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
13511355
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
13521356
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,16 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
570570
let Defs = [EXEC];
571571
}
572572

573+
// Sets EXEC to all lanes and returns the previous EXEC.
574+
// Pseudo instruction with no input operands and a single SReg_1 result,
// matched directly from the int_amdgcn_init_whole_wave intrinsic.
def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
575+
(outs SReg_1:$dst), (ins),
576+
[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
577+
// EXEC is both implicitly written and implicitly read by this pseudo.
let Defs = [EXEC];
578+
let Uses = [EXEC];
579+
580+
let isConvergent = 1;
581+
}
582+
573583
// Return for returning shaders to a shader variant epilog.
574584
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
575585
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
295295
StringValue SGPRForEXECCopy;
296296
StringValue LongBranchReservedReg;
297297

298+
bool HasInitWholeWave = false;
299+
298300
SIMachineFunctionInfo() = default;
299301
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
300302
const TargetRegisterInfo &TRI,
@@ -342,6 +344,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
342344
StringValue()); // Don't print out when it's empty.
343345
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
344346
StringValue());
347+
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
345348
}
346349
};
347350

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -586,7 +586,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
586586
KillInstrs.push_back(&MI);
587587
BBI.NeedsLowering = true;
588588
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
589-
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
589+
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
590+
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
590591
InitExecInstrs.push_back(&MI);
591592
} else if (WQMOutputs) {
592593
// The function is in machine SSA form, which means that physical
@@ -1571,6 +1572,33 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
15711572
MachineBasicBlock *MBB = MI.getParent();
15721573
bool IsWave32 = ST->isWave32();
15731574

1575+
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1576+
assert(MBB == &MBB->getParent()->front() &&
1577+
"init whole wave not in entry block");
1578+
Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
1579+
MachineInstr *SaveExec =
1580+
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1581+
TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
1582+
: AMDGPU::S_OR_SAVEEXEC_B64),
1583+
EntryExec)
1584+
.addImm(-1);
1585+
1586+
// Replace all uses of MI's destination reg with EntryExec.
1587+
MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
1588+
1589+
if (LIS) {
1590+
LIS->RemoveMachineInstrFromMaps(MI);
1591+
}
1592+
1593+
MI.eraseFromParent();
1594+
1595+
if (LIS) {
1596+
LIS->InsertMachineInstrInMaps(*SaveExec);
1597+
LIS->createAndComputeVirtRegInterval(EntryExec);
1598+
}
1599+
return;
1600+
}
1601+
15741602
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
15751603
// This should be before all vector instructions.
15761604
MachineInstr *InitMI =

0 commit comments

Comments
 (0)