Skip to content

Commit f79595f

Browse files
committed
Address review comments
1 parent 01b87dc commit f79595f

File tree

7 files changed

+797
-1305
lines changed

7 files changed

+797
-1305
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,11 +216,10 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
216216
// (mostly related to WWM CSR handling) that differentiate it from using
217217
// a plain `amdgcn.init.exec -1`.
218218
//
219-
// Can only be used in functions with the `amdgpu_cs_chain` calling convention.
220219
// Using this intrinsic without immediately branching on its return value is an
221220
// error.
222221
def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
223-
IntrHasSideEffects, IntrNoMem, IntrNoDuplicate, IntrConvergent]>;
222+
IntrHasSideEffects, IntrNoMem, IntrConvergent]>;
224223

225224
def int_amdgcn_wavefrontsize :
226225
ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 0 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -15677,133 +15677,6 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
1567715677
}
1567815678
}
1567915679

15680-
static void removeInitWholeWaveBranch(MachineFunction &MF,
15681-
MachineRegisterInfo &MRI,
15682-
const SIInstrInfo *TII) {
15683-
// Remove SI_INIT_WHOLE_WAVE and the following SI_IF/END_CF and instead set
15684-
// EXEC to -1 at SI_END_CF.
15685-
auto IWWIt = find_if(MF.begin()->instrs(), [](const MachineInstr &MI) {
15686-
return MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE;
15687-
});
15688-
if (IWWIt == MF.begin()->instr_end())
15689-
return; // We've been here before (GISel runs finalizeLowering twice).
15690-
15691-
MachineInstr &If = *MRI.use_begin(IWWIt->getOperand(0).getReg())->getParent();
15692-
assert(If.getOpcode() == AMDGPU::SI_IF &&
15693-
"Unexpected user for init.whole.wave result");
15694-
assert(MRI.hasOneUse(IWWIt->getOperand(0).getReg()) &&
15695-
"Expected simple control flow");
15696-
15697-
MachineInstr &EndCf = *MRI.use_begin(If.getOperand(0).getReg())->getParent();
15698-
MachineBasicBlock *EndBB = EndCf.getParent();
15699-
15700-
// Update all the Phis: since we're removing a predecessor, we need to remove
15701-
// the corresponding pair of operands. However, we can't just drop the value
15702-
// coming from the 'if' block - that's going to be the value of the inactive
15703-
// lanes.
15704-
// %v = phi (%inactive, %if), (%active1, %shader1), ... (%activeN, %shaderN)
15705-
// should become
15706-
// %t = phi (%active1, %shader1), ... (%activeN, %shaderN)
15707-
// %v = v_set_inactive %t, %inactive
15708-
// Note that usually EndCf will be the first instruction after the phis and as
15709-
// such will serve as the end of the range when iterating over phis.
15710-
// Therefore, we shouldn't introduce any new instructions before it.
15711-
const SIRegisterInfo &TRI = TII->getRegisterInfo();
15712-
auto AfterEndCf = std::next(EndCf.getIterator());
15713-
for (auto &Phi : EndBB->phis()) {
15714-
Register PhiDest = Phi.getOperand(0).getReg();
15715-
const TargetRegisterClass *PhiRC = MRI.getRegClass(PhiDest);
15716-
15717-
Register NewPhiDest = MRI.createVirtualRegister(PhiRC);
15718-
Phi.getOperand(0).setReg(NewPhiDest);
15719-
15720-
unsigned InactiveOpIdx = 0;
15721-
for (unsigned I = 1; I < Phi.getNumOperands(); I += 2) {
15722-
if (Phi.getOperand(I + 1).getMBB() == If.getParent()) {
15723-
InactiveOpIdx = I;
15724-
break;
15725-
}
15726-
}
15727-
assert(InactiveOpIdx != 0 && "Broken phi?");
15728-
15729-
// At this point, the register class could be larger than 32 or 64, so we
15730-
// might have to use more than one V_SET_INACTIVE instruction.
15731-
unsigned Size = TRI.getRegSizeInBits(*PhiRC);
15732-
switch (Size) {
15733-
case 32:
15734-
BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
15735-
TII->get(AMDGPU::V_SET_INACTIVE_B32), PhiDest)
15736-
.addReg(NewPhiDest)
15737-
.add(Phi.getOperand(InactiveOpIdx));
15738-
break;
15739-
case 64:
15740-
BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
15741-
TII->get(AMDGPU::V_SET_INACTIVE_B64), PhiDest)
15742-
.addReg(NewPhiDest)
15743-
.add(Phi.getOperand(InactiveOpIdx));
15744-
break;
15745-
default: {
15746-
// For each 32-bit subregister of the register at InactiveOpIdx, insert
15747-
// a COPY to a new register, and a V_SET_INACTIVE_B32 using the
15748-
// corresponding subregisters of PhiDest and NewPhiDest.
15749-
// FIXME: There has to be a better way to iterate over this...
15750-
llvm::SmallVector<Register, 16> PhiSubRegs;
15751-
const unsigned SubRegIndices[] = {
15752-
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
15753-
AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
15754-
AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
15755-
AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
15756-
AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
15757-
AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
15758-
AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
15759-
AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31};
15760-
const unsigned NumSubRegs = Size / 32;
15761-
assert(sizeof(SubRegIndices) / sizeof(SubRegIndices[0]) >= NumSubRegs &&
15762-
"Not enough subregister indices");
15763-
for (unsigned I = 0; I != NumSubRegs; ++I) {
15764-
unsigned SubRegIdx = SubRegIndices[I];
15765-
Register InactiveSubReg =
15766-
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15767-
BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(), TII->get(AMDGPU::COPY),
15768-
InactiveSubReg)
15769-
.addReg(Phi.getOperand(InactiveOpIdx).getReg(), 0, SubRegIdx);
15770-
15771-
Register AllLanesSubReg =
15772-
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15773-
BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
15774-
TII->get(AMDGPU::V_SET_INACTIVE_B32), AllLanesSubReg)
15775-
.addReg(NewPhiDest, 0, SubRegIdx)
15776-
.addReg(InactiveSubReg);
15777-
PhiSubRegs.push_back(AllLanesSubReg);
15778-
}
15779-
// Now we need to combine the subregisters into the original register.
15780-
auto RegSequence = BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
15781-
TII->get(AMDGPU::REG_SEQUENCE), PhiDest);
15782-
for (unsigned I = 0; I < NumSubRegs; ++I) {
15783-
RegSequence.addReg(PhiSubRegs[I]);
15784-
RegSequence.addImm(SubRegIndices[I]);
15785-
}
15786-
break;
15787-
}
15788-
}
15789-
15790-
Phi.removeOperand(InactiveOpIdx + 1);
15791-
Phi.removeOperand(InactiveOpIdx);
15792-
}
15793-
If.getParent()->removeSuccessor(EndBB);
15794-
15795-
BuildMI(*EndBB, AfterEndCf, IWWIt->getDebugLoc(),
15796-
TII->get(MF.getSubtarget<GCNSubtarget>().isWave32()
15797-
? AMDGPU::S_MOV_B32
15798-
: AMDGPU::S_MOV_B64),
15799-
TII->getRegisterInfo().getExec())
15800-
.addImm(-1);
15801-
15802-
EndCf.eraseFromParent();
15803-
If.eraseFromParent();
15804-
IWWIt->eraseFromParent();
15805-
}
15806-
1580715680
// Figure out which registers should be reserved for stack access. Only after
1580815681
// the function is legalized do we know all of the non-spill stack objects or if
1580915682
// calls are present.
@@ -15814,12 +15687,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
1581415687
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
1581515688
const SIInstrInfo *TII = ST.getInstrInfo();
1581615689

15817-
if (Info->hasInitWholeWave()) {
15818-
assert(Info->isChainFunction() &&
15819-
"init.whole.wave may only be used in chain functions");
15820-
removeInitWholeWaveBranch(MF, MRI, TII);
15821-
}
15822-
1582315690
if (Info->isEntryFunction()) {
1582415691
// Callable functions have fixed registers used for stack access.
1582515692
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,8 @@ def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
589589
[(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
590590
let Defs = [EXEC];
591591
let Uses = [EXEC];
592+
593+
let isConvergent = 1;
592594
}
593595

594596
// Return for returning shaders to a shader variant epilog.

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
594594
KillInstrs.push_back(&MI);
595595
BBI.NeedsLowering = true;
596596
} else if (Opcode == AMDGPU::SI_INIT_EXEC ||
597-
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) {
597+
Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT ||
598+
Opcode == AMDGPU::SI_INIT_WHOLE_WAVE) {
598599
InitExecInstrs.push_back(&MI);
599600
} else if (WQMOutputs) {
600601
// The function is in machine SSA form, which means that physical
@@ -1582,6 +1583,28 @@ void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) {
15821583
MachineBasicBlock *MBB = MI.getParent();
15831584
bool IsWave32 = ST->isWave32();
15841585

1586+
if (MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE) {
1587+
// TODO: Assert that it's in the entry block
1588+
Register EntryExec = MRI->createVirtualRegister(TRI->getBoolRC());
1589+
MachineInstr *SaveExec =
1590+
BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
1591+
TII->get(IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32
1592+
: AMDGPU::S_OR_SAVEEXEC_B64),
1593+
EntryExec)
1594+
.addImm(-1);
1595+
1596+
// Replace all uses of MI's destination reg with EntryExec.
1597+
MRI->replaceRegWith(MI.getOperand(0).getReg(), EntryExec);
1598+
MI.eraseFromParent();
1599+
1600+
if (LIS) {
1601+
LIS->RemoveMachineInstrFromMaps(MI);
1602+
LIS->InsertMachineInstrInMaps(*SaveExec);
1603+
LIS->createAndComputeVirtRegInterval(EntryExec);
1604+
}
1605+
return;
1606+
}
1607+
15851608
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
15861609
// This should be before all vector instructions.
15871610
MachineInstr *InitMI =

0 commit comments

Comments
 (0)