Skip to content

Revert "Reland "[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic" (#108054)"" #108341

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -208,16 +208,6 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
[IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

// Sets the function into whole-wave-mode and returns whether the lane was
// active when entering the function. A branch depending on this return will
// revert the EXEC mask to what it was when entering the function, thus
// resulting in a no-op. This pattern is used to optimize branches when function
// tails need to be run in whole-wave-mode. It may also have other consequences
// (mostly related to WWM CSR handling) that differentiate it from using
// a plain `amdgcn.init.exec -1`.
// The intrinsic takes no arguments and returns an i1; it is declared
// convergent, as having side effects, and as not accessing memory.
def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
IntrHasSideEffects, IntrNoMem, IntrConvergent]>;

def int_amdgcn_wavefrontsize :
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
Expand Down
5 changes: 0 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2738,11 +2738,6 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
SelectDSBvhStackIntrinsic(N);
return;
case Intrinsic::amdgcn_init_whole_wave:
CurDAG->getMachineFunction()
.getInfo<SIMachineFunctionInfo>()
->setInitWholeWave();
break;
}

SelectCode(N);
Expand Down
10 changes: 0 additions & 10 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1772,14 +1772,6 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

/// Selects an init.whole.wave intrinsic instruction.
///
/// Flags the machine function that contains \p MI as using init.whole.wave
/// (via SIMachineFunctionInfo::setInitWholeWave) and then hands the actual
/// instruction selection over to selectImpl.
///
/// \returns whatever selectImpl reports for \p MI.
bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
  // MI -> parent basic block -> parent machine function.
  MachineFunction &ParentMF = *MI.getParent()->getParent();
  ParentMF.getInfo<SIMachineFunctionInfo>()->setInitWholeWave();

  return selectImpl(MI, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
Expand Down Expand Up @@ -2107,8 +2099,6 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
return selectDSAppendConsume(I, true);
case Intrinsic::amdgcn_ds_consume:
return selectDSAppendConsume(I, false);
case Intrinsic::amdgcn_init_whole_wave:
return selectInitWholeWave(I);
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,6 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
bool selectInitWholeWave(MachineInstr &MI) const;
bool selectSBarrier(MachineInstr &MI) const;
bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;

Expand Down
5 changes: 0 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
// Kernel may need limited waves per EU for better performance.
bool WaveLimiter = false;

bool HasInitWholeWave = false;

public:
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);

Expand Down Expand Up @@ -111,9 +109,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return WaveLimiter;
}

// Returns the current value of the HasInitWholeWave flag.
bool hasInitWholeWave() const { return HasInitWholeWave; }
// Sets the HasInitWholeWave flag to true. The flag is only ever raised here;
// this setter takes no argument and cannot clear it.
void setInitWholeWave() { HasInitWholeWave = true; }

unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
return allocateLDSGlobal(DL, GV, DynLDSAlign);
}
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4997,7 +4997,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
break;
}
case Intrinsic::amdgcn_init_whole_wave:
case Intrinsic::amdgcn_live_mask: {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
def : SourceOfDivergence<int_amdgcn_writelane>;
def : SourceOfDivergence<int_amdgcn_init_whole_wave>;

foreach intr = AMDGPUMFMAIntrinsics908 in
def : SourceOfDivergence<intr>;
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1740,9 +1740,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
? DenormalMode::IEEE
: DenormalMode::PreserveSign;

if (YamlMFI.HasInitWholeWave)
MFI->setInitWholeWave();

return false;
}

Expand Down
12 changes: 4 additions & 8 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1343,14 +1343,10 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(

// Allocate spill slots for WWM reserved VGPRs.
// For chain functions, we only need to do this if we have calls to
// llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
// chain functions do not return) and the function did not contain a call to
// llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
// when entering the function).
bool IsChainWithoutRestores =
FuncInfo->isChainFunction() &&
(!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
// llvm.amdgcn.cs.chain.
bool IsChainWithoutCalls =
FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
Expand Down
10 changes: 0 additions & 10 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -570,16 +570,6 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
let Defs = [EXEC];
}

// Sets EXEC to all lanes and returns the previous EXEC.
def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
(outs SReg_1:$dst), (ins),
[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
// EXEC is written: the pseudo sets it to all lanes.
let Defs = [EXEC];
// EXEC is also read: its previous value is what gets returned in $dst.
let Uses = [EXEC];

// Marked convergent, like the int_amdgcn_init_whole_wave intrinsic this
// pattern selects from.
let isConvergent = 1;
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
Expand Down
3 changes: 0 additions & 3 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
StringValue SGPRForEXECCopy;
StringValue LongBranchReservedReg;

bool HasInitWholeWave = false;

SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
const TargetRegisterInfo &TRI,
Expand Down Expand Up @@ -344,7 +342,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue()); // Don't print out when it's empty.
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
StringValue());
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
}
};

Expand Down
30 changes: 1 addition & 29 deletions llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -586,8 +586,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
KillInstrs.push_back(&MI);
BBI.NeedsLowering = true;
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
InitExecInstrs.push_back(&MI);
} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical
Expand Down Expand Up @@ -1572,33 +1571,6 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
MachineBasicBlock *MBB = MI.getParent();
bool IsWave32 = ST->isWave32();

if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
assert(MBB == &MBB->getParent()->front() &&
"init whole wave not in entry block");
Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
MachineInstr *SaveExec =
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64),
EntryExec)
.addImm(-1);

// Replace all uses of MI's destination reg with EntryExec.
MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);

if (LIS) {
LIS->RemoveMachineInstrFromMaps(MI);
}

MI.eraseFromParent();

if (LIS) {
LIS->InsertMachineInstrInMaps(*SaveExec);
LIS->createAndComputeVirtRegInterval(EntryExec);
}
return;
}

if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
MachineInstr *InitMI =
Expand Down
Loading
Loading