Skip to content

Commit d92f149

Browse files
authored
Revert "Revert "[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic" (#108054)"
This reverts commit c7a7767.
1 parent c7a7767 commit d92f149

22 files changed

+1524
-5
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,16 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
208208
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
209209
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;
210210

211+
// Sets the function into whole-wave-mode and returns whether the lane was
212+
// active when entering the function. A branch depending on this return will
213+
// revert the EXEC mask to what it was when entering the function, thus
214+
// resulting in a no-op. This pattern is used to optimize branches when function
215+
// tails need to be run in whole-wave-mode. It may also have other consequences
216+
// (mostly related to WWM CSR handling) that differentiate it from using
217+
// a plain `amdgcn.init.exec -1`.
218+
// Takes no operands and produces a single i1 result. Declared convergent and
// side-effecting, and as not accessing memory.
def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
219+
IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
220+
211221
def int_amdgcn_wavefrontsize :
212222
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
213223
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
27382738
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
27392739
SelectDSBvhStackIntrinsic(N);
27402740
return;
2741+
case Intrinsic::amdgcn_init_whole_wave:
2742+
CurDAG->getMachineFunction()
2743+
.getInfo<SIMachineFunctionInfo>()
2744+
->setInitWholeWave();
2745+
break;
27412746
}
27422747

27432748
SelectCode(N);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
17721772
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
17731773
}
17741774

1775+
// GlobalISel selection hook for llvm.amdgcn.init.whole.wave.
//
// Records on the function's SIMachineFunctionInfo that the intrinsic is
// present, then hands the instruction to selectImpl() and returns its result.
// The flag is set unconditionally, i.e. even if selectImpl() reports failure.
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
1776+
MachineFunction *MF = MI.getParent()->getParent();
1777+
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1778+
1779+
MFInfo->setInitWholeWave();
1780+
return selectImpl(MI, *CoverageInfo);
1781+
}
1782+
17751783
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
17761784
if (TM.getOptLevel() > CodeGenOptLevel::None) {
17771785
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
20992107
return selectDSAppendConsume(I, true);
21002108
case Intrinsic::amdgcn_ds_consume:
21012109
return selectDSAppendConsume(I, false);
2110+
case Intrinsic::amdgcn_init_whole_wave:
2111+
return selectInitWholeWave(I);
21022112
case Intrinsic::amdgcn_s_barrier:
21032113
return selectSBarrier(I);
21042114
case Intrinsic::amdgcn_raw_buffer_load_lds:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
120120
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
121121
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
122122
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
123+
bool selectInitWholeWave(MachineInstr &MI) const;
123124
bool selectSBarrier(MachineInstr &MI) const;
124125
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;
125126

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
6767
// Kernel may need limited waves per EU for better performance.
6868
bool WaveLimiter = false;
6969

70+
bool HasInitWholeWave = false;
71+
7072
public:
7173
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
7274

@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
109111
return WaveLimiter;
110112
}
111113

114+
// Returns the HasInitWholeWave flag: false by default, true after
// setInitWholeWave() has been called on this function info.
bool hasInitWholeWave() const { return HasInitWholeWave; }
115+
// Marks the function as containing llvm.amdgcn.init.whole.wave. This only ever
// sets the flag to true; calling it repeatedly is harmless.
void setInitWholeWave() { HasInitWholeWave = true; }
116+
112117
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
113118
return allocateLDSGlobal(DL, GV, DynLDSAlign);
114119
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4997,6 +4997,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49974997
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
49984998
break;
49994999
}
5000+
case Intrinsic::amdgcn_init_whole_wave:
50005001
case Intrinsic::amdgcn_live_mask: {
50015002
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
50025003
break;

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
329329
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
330330
def : SourceOfDivergence<int_amdgcn_update_dpp>;
331331
def : SourceOfDivergence<int_amdgcn_writelane>;
332+
def : SourceOfDivergence<int_amdgcn_init_whole_wave>;
332333

333334
foreach intr = AMDGPUMFMAIntrinsics908 in
334335
def : SourceOfDivergence<intr>;

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1739,6 +1739,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
17391739
? DenormalMode::IEEE
17401740
: DenormalMode::PreserveSign;
17411741

1742+
if (YamlMFI.HasInitWholeWave)
1743+
MFI->setInitWholeWave();
1744+
17421745
return false;
17431746
}
17441747

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
13431343

13441344
// Allocate spill slots for WWM reserved VGPRs.
13451345
// For chain functions, we only need to do this if we have calls to
1346-
// llvm.amdgcn.cs.chain.
1347-
bool IsChainWithoutCalls =
1348-
FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
1349-
if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
1346+
// llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
1347+
// chain functions do not return) and the function did not contain a call to
1348+
// llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
1349+
// when entering the function).
1350+
bool IsChainWithoutRestores =
1351+
FuncInfo->isChainFunction() &&
1352+
(!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
1353+
if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
13501354
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
13511355
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
13521356
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,16 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
583583
let Defs = [EXEC];
584584
}
585585

586+
// Sets EXEC to all lanes and returns the previous EXEC.
587+
// Pseudo selected directly from llvm.amdgcn.init.whole.wave; the i1 result is
// produced in $dst and there are no explicit inputs.
def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
588+
(outs SReg_1:$dst), (ins),
589+
[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
590+
// EXEC is modeled as both written (Defs) and read (Uses) by this pseudo.
let Defs = [EXEC];
591+
let Uses = [EXEC];
592+
593+
let isConvergent = 1;
594+
}
595+
586596
// Return for returning shaders to a shader variant epilog.
587597
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
588598
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
289289
StringValue SGPRForEXECCopy;
290290
StringValue LongBranchReservedReg;
291291

292+
bool HasInitWholeWave = false;
293+
292294
SIMachineFunctionInfo() = default;
293295
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
294296
const TargetRegisterInfo &TRI,
@@ -336,6 +338,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
336338
StringValue()); // Don't print out when it's empty.
337339
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
338340
StringValue());
341+
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
339342
}
340343
};
341344

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
594594
KillInstrs.push_back(&MI);
595595
BBI.NeedsLowering = true;
596596
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
597-
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
597+
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
598+
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
598599
InitExecInstrs.push_back(&MI);
599600
} else if (WQMOutputs) {
600601
// The function is in machine SSA form, which means that physical
@@ -1582,6 +1583,29 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
15821583
MachineBasicBlock *MBB = MI.getParent();
15831584
bool IsWave32 = ST->isWave32();
15841585

1586+
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
  // Lower SI_INIT_WHOLE_WAVE. The pseudo is specified as "set EXEC to all
  // lanes and return the previous EXEC"; it is expanded to one
  // S_OR_SAVEEXEC with an all-ones immediate placed at the very top of the
  // entry block, and every user of the pseudo's result is redirected to the
  // register defined by that instruction.
  assert(MBB == &MBB->getParent()->front() &&
         "init whole wave not in entry block");
  Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
  MachineInstr *SaveExec =
      BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
              TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
                                : AMDGPU::S_OR_SAVEEXEC_B64),
              EntryExec)
          .addImm(-1);

  // Replace all uses of MI's destination reg with EntryExec.
  MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

  // MI must be removed from the LiveIntervals/SlotIndexes maps while it is
  // still a live object. Doing this after eraseFromParent() would pass a
  // deleted instruction to LiveIntervals (use-after-free).
  if (LIS)
    LIS->RemoveMachineInstrFromMaps(MI);

  MI.eraseFromParent();

  if (LIS) {
    LIS->InsertMachineInstrInMaps(*SaveExec);
    LIS->createAndComputeVirtRegInterval(EntryExec);
  }
  return;
}
15851609
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
15861610
// This should be before all vector instructions.
15871611
MachineInstr *InitMI =

0 commit comments

Comments
 (0)