Skip to content

Commit 6dd26d4

Browse files
AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize
Add rules for G_AMDGPU_BUFFER_LOAD and implement waterfall lowering for divergent operands that must be sgpr.
1 parent fcd0dc7 commit 6dd26d4

17 files changed

+512
-240
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -117,45 +117,73 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
117117
return LLT::scalar(32);
118118
}
119119

120-
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
121-
const RegisterBankInfo &RBI);
120+
// Callback that emits a single lane-read instruction. The helpers in this
// file invoke it as (Builder, SgprDst, VgprSrc) and use the returned
// MachineInstrBuilder to query the defined register.
typedef std::function<MachineInstrBuilder(MachineIRBuilder &, Register,
121+
Register)>
122+
ReadLaneFnTy;
123+
124+
// Forward declaration: buildReadLane and unmergeReadAnyLane call each other.
static Register buildReadLane(MachineIRBuilder &, Register,
125+
const RegisterBankInfo &, ReadLaneFnTy);
122126

123127
// Split VgprSrc into pieces of type UnmergeTy on the VGPR bank, read each
// piece through buildReadLane (which in turn uses the BuildRL callback), and
// append the resulting registers to SgprDstParts in unmerge order.
static void unmergeReadAnyLane(MachineIRBuilder &B,
124128
SmallVectorImpl<Register> &SgprDstParts,
125129
LLT UnmergeTy, Register VgprSrc,
126-
const RegisterBankInfo &RBI) {
130+
const RegisterBankInfo &RBI,
131+
ReadLaneFnTy BuildRL) {
127132
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
128133
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
129134
// The last operand is skipped; presumably it is the unmerge source rather
// than one of the produced pieces.
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
130-
SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
135+
SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
131136
}
132137
}
133138

134-
// Read VgprSrc into a newly created SGPR-bank register of the same type and
// return that register. A 32-bit value is read with one BuildRL call; any
// other size is unmerged into getReadAnyLaneSplitTy(Ty) pieces, each piece is
// read separately, and the pieces are merged back together.
static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
135-
const RegisterBankInfo &RBI) {
139+
static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
140+
const RegisterBankInfo &RBI,
141+
ReadLaneFnTy BuildRL) {
136142
LLT Ty = B.getMRI()->getType(VgprSrc);
137143
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
138144
if (Ty.getSizeInBits() == 32) {
139-
return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
140-
.getReg(0);
145+
// The destination is created up front because the callback takes an
// explicit destination register.
Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
146+
return BuildRL(B, SgprDst, VgprSrc).getReg(0);
141147
}
142148

143149
SmallVector<Register, 8> SgprDstParts;
144-
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
150+
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
151+
BuildRL);
145152

146153
return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
147154
}
148155

149-
// Variant of buildReadLane that writes into a caller-provided SgprDst instead
// of creating the destination itself. A 32-bit source is read with a single
// BuildReadLane call; any other size is split into getReadAnyLaneSplitTy(Ty)
// pieces that are read individually and merged into SgprDst.
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
150-
Register VgprSrc, const RegisterBankInfo &RBI) {
156+
static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
157+
Register VgprSrc, const RegisterBankInfo &RBI,
158+
ReadLaneFnTy BuildReadLane) {
151159
LLT Ty = B.getMRI()->getType(VgprSrc);
152160
if (Ty.getSizeInBits() == 32) {
153-
B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
161+
BuildReadLane(B, SgprDst, VgprSrc);
154162
return;
155163
}
156164

157165
SmallVector<Register, 8> SgprDstParts;
158-
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
166+
unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
167+
BuildReadLane);
159168

160169
// The merge defines SgprDst directly; the value of the trailing getReg(0) is
// not used.
B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
161170
}
171+
172+
// Read VgprSrc into SgprDst using G_AMDGPU_READANYLANE. The work is delegated
// to the static buildReadLane helper; the lambda supplies the instruction that
// helper emits for each piece it reads.
void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
173+
Register VgprSrc, const RegisterBankInfo &RBI) {
174+
return buildReadLane(
175+
B, SgprDst, VgprSrc, RBI,
176+
[](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
177+
return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
178+
});
179+
}
180+
181+
// Read VgprSrc into SgprDst using the amdgcn_readfirstlane intrinsic. Shares
// the static buildReadLane helper with buildReadAnyLane; only the instruction
// emitted by the lambda differs.
void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
182+
Register VgprSrc, const RegisterBankInfo &RBI) {
183+
return buildReadLane(
184+
B, SgprDst, VgprSrc, RBI,
185+
[](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
186+
// SgprDst is the intrinsic result; VgprSrc is added as its source operand.
return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
187+
.addReg(VgprSrc);
188+
});
189+
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ class IntrinsicLaneMaskAnalyzer {
5151

5252
/// Emit instructions that read \p VgprSrc into \p SgprDst using
/// G_AMDGPU_READANYLANE.
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
5353
const RegisterBankInfo &RBI);
54+
/// Emit instructions that read \p VgprSrc into \p SgprDst using the
/// amdgcn_readfirstlane intrinsic.
void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
55+
const RegisterBankInfo &RBI);
5456
}
5557
}
5658

llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp

Lines changed: 238 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,226 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
5757
lower(MI, Mapping, WaterfallSgprs);
5858
}
5959

60+
/// Wrap the instructions in \p Range in a waterfall loop so that every use
/// operand listed in \p SGPROperandRegs is replaced by an SGPR-bank register
/// produced with readfirstlane.
///
/// The block holding the range is split: the instructions after the range move
/// to a new remainder block, the range itself moves to a loop body block, and
/// a loop header block receives the readfirstlane, compare, ballot and
/// and-saveexec sequence. EXEC is saved at the end of the original block and
/// restored in a dedicated block after the loop.
///
/// On return the insert point of \p B is the start of the remainder block.
/// The function always returns true.
bool RegBankLegalizeHelper::executeInWaterfallLoop(
61+
MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
62+
SmallSet<Register, 4> &SGPROperandRegs) {
63+
// Track use registers which have already been expanded with a readfirstlane
64+
// sequence. This may have multiple uses if moving a sequence.
65+
DenseMap<Register, Register> WaterfalledRegMap;
66+
67+
MachineBasicBlock &MBB = B.getMBB();
68+
MachineFunction &MF = B.getMF();
69+
70+
// Pick the 32-bit or 64-bit flavour of each exec-mask opcode and the exec
// register itself, depending on the wave size.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
71+
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
72+
unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
73+
if (ST.isWave32()) {
74+
MovExecOpc = AMDGPU::S_MOV_B32;
75+
MovExecTermOpc = AMDGPU::S_MOV_B32_term;
76+
XorTermOpc = AMDGPU::S_XOR_B32_term;
77+
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
78+
ExecReg = AMDGPU::EXEC_LO;
79+
} else {
80+
MovExecOpc = AMDGPU::S_MOV_B64;
81+
MovExecTermOpc = AMDGPU::S_MOV_B64_term;
82+
XorTermOpc = AMDGPU::S_XOR_B64_term;
83+
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
84+
ExecReg = AMDGPU::EXEC;
85+
}
86+
87+
// Only needed by the assert after the splice, hence the NDEBUG guard.
#ifndef NDEBUG
88+
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
89+
#endif
90+
91+
MachineRegisterInfo &MRI = *B.getMRI();
92+
Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
93+
Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
94+
95+
// Don't bother using generic instructions/registers for the exec mask.
// NOTE(review): InitSaveExecReg is only defined by this IMPLICIT_DEF and is
// not referenced anywhere else in this function - confirm it is still needed.
96+
B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
97+
98+
Register SavedExec = MRI.createVirtualRegister(WaveRC);
99+
100+
// To insert the loop we need to split the block. The four new blocks are
101+
// placed directly after MBB, and everything after the range is moved into
102+
// the last of them (RemainderBB).
103+
MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
104+
MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
105+
MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
106+
MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
107+
MachineFunction::iterator MBBI(MBB);
108+
++MBBI;
109+
MF.insert(MBBI, LoopBB);
110+
MF.insert(MBBI, BodyBB);
111+
MF.insert(MBBI, RestoreExecBB);
112+
MF.insert(MBBI, RemainderBB);
113+
114+
// BodyBB either exits to RestoreExecBB or branches back to LoopBB.
LoopBB->addSuccessor(BodyBB);
115+
BodyBB->addSuccessor(RestoreExecBB);
116+
BodyBB->addSuccessor(LoopBB);
117+
118+
// Move the rest of the block into a new block.
119+
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
120+
RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
121+
122+
MBB.addSuccessor(LoopBB);
123+
RestoreExecBB->addSuccessor(RemainderBB);
124+
125+
B.setInsertPt(*LoopBB, LoopBB->end());
126+
127+
// +-MBB:------------+
128+
// | ... |
129+
// | %0 = G_INST_1 |
130+
// | %Dst = MI %Vgpr |
131+
// | %1 = G_INST_2 |
132+
// | ... |
133+
// +-----------------+
134+
// ->
135+
// +-MBB-------------------------------+
136+
// | ... |
137+
// | %0 = G_INST_1 |
138+
// | %SaveExecReg = S_MOV_B32 $exec_lo |
139+
// +----------------|------------------+
140+
// | /------------------------------|
141+
// V V |
142+
// +-LoopBB---------------------------------------------------------------+ |
143+
// | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
144+
// | instead of executing for each lane, see if other lanes had | |
145+
// | same value for %Vgpr and execute for them also. | |
146+
// | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
147+
// | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
148+
// | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
149+
// | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
150+
// +----------------|-----------------------------------------------------+ |
151+
// V |
152+
// +-BodyBB------------------------------------------------------------+ |
153+
// | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
154+
// | executed only for active lanes and written to Dst | |
155+
// | $exec = S_XOR_B32 $exec, %SavedExec | |
156+
// | set active lanes to 0 in SavedExec, lanes that did not write to | |
157+
// | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
158+
// | SI_WATERFALL_LOOP LoopBB |-----|
159+
// +----------------|--------------------------------------------------+
160+
// V
161+
// +-RestoreExecBB--------------------------+
162+
// | $exec_lo = S_MOV_B32_term %SaveExecReg |
163+
// +----------------|-----------------------+
164+
// V
165+
// +-RemainderBB:----------------------+
166+
// | %1 = G_INST_2 |
167+
// | ... |
168+
// +---------------------------------- +
169+
170+
// Move the instruction into the loop body. Note we moved everything after
171+
// Range.end() already into a new block, so Range.end() is no longer valid.
172+
BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
173+
174+
// Figure out the iterator range after splicing the instructions.
175+
MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
176+
auto NewEnd = BodyBB->end();
177+
assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
178+
179+
B.setMBB(*LoopBB);
180+
// Conjunction of all per-part equality compares built below, accumulated
// across every waterfalled operand.
// NOTE(review): stays unset if no use in the range is in SGPROperandRegs, yet
// it is still passed to the ballot below - confirm callers guarantee a match.
Register CondReg;
181+
182+
for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
183+
for (MachineOperand &Op : MI.all_uses()) {
184+
Register OldReg = Op.getReg();
185+
if (!SGPROperandRegs.count(OldReg))
186+
continue;
187+
188+
// See if we already processed this register in another instruction in
189+
// the sequence.
190+
auto OldVal = WaterfalledRegMap.find(OldReg);
191+
if (OldVal != WaterfalledRegMap.end()) {
192+
Op.setReg(OldVal->second);
193+
continue;
194+
}
195+
196+
Register OpReg = Op.getReg();
197+
LLT OpTy = MRI.getType(OpReg);
198+
199+
// TODO: support for agpr
200+
assert(MRI.getRegBank(OpReg) == VgprRB);
201+
Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
202+
buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
203+
204+
// Build the comparison(s), CurrentLaneReg == OpReg.
// Compare in 64-bit parts when the operand size is a multiple of 64 bits,
// otherwise in 32-bit parts.
205+
unsigned OpSize = OpTy.getSizeInBits();
206+
bool Is64 = OpSize % 64 == 0;
207+
unsigned PartSize = Is64 ? 64 : 32;
208+
LLT PartTy = LLT::scalar(PartSize);
209+
unsigned NumParts = OpSize / PartSize;
210+
SmallVector<Register, 8> OpParts;
211+
SmallVector<Register, 8> CurrentLaneParts;
212+
213+
if (NumParts == 1) {
214+
OpParts.push_back(OpReg);
215+
CurrentLaneParts.push_back(CurrentLaneReg);
216+
} else {
217+
auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
218+
auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
219+
for (unsigned i = 0; i < NumParts; ++i) {
220+
OpParts.push_back(UnmergeOp.getReg(i));
221+
CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
222+
}
223+
}
224+
225+
for (unsigned i = 0; i < NumParts; ++i) {
226+
Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
227+
B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
228+
229+
// The first compare seeds CondReg; later ones are ANDed into it.
if (!CondReg) {
230+
CondReg = CmpReg;
231+
} else {
232+
CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
233+
}
234+
}
235+
236+
Op.setReg(CurrentLaneReg);
237+
238+
// Make sure we don't re-process this register again.
239+
WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
240+
}
241+
}
242+
243+
// Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
244+
Register CondRegLM =
245+
MRI.createVirtualRegister({WaveRC, LLT::scalar(ST.isWave32() ? 32 : 64)});
246+
B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
247+
248+
// Update EXEC, save the original EXEC value to SavedExec.
249+
B.buildInstr(AndSaveExecOpc)
250+
.addDef(SavedExec)
251+
.addReg(CondRegLM, RegState::Kill);
252+
MRI.setSimpleHint(SavedExec, CondRegLM);
253+
254+
// The remaining loop instructions go at the end of BodyBB, after the
// instructions spliced in from the original range.
B.setInsertPt(*BodyBB, BodyBB->end());
255+
256+
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
257+
B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
258+
259+
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
260+
// s_cbranch_scc0?
261+
262+
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
263+
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
264+
265+
// Save the EXEC mask before the loop.
266+
B.setInsertPt(MBB, MBB.end());
267+
B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
268+
269+
// Restore the EXEC mask after the loop.
270+
B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
271+
B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
272+
273+
// Set the insert point after the original instruction, so any new
274+
// instructions will be in the remainder.
275+
B.setInsertPt(*RemainderBB, RemainderBB->begin());
276+
277+
return true;
278+
}
279+
60280
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
61281
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
62282
MachineFunction &MF = B.getMF();
@@ -395,7 +615,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
395615

396616
switch (Mapping.LoweringMethod) {
397617
case DoNotLower:
398-
return;
618+
break;
399619
case VccExtToSel:
400620
return lowerVccExtToSel(MI);
401621
case UniExtToSel: {
@@ -531,7 +751,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
531751
}
532752
}
533753

534-
// TODO: executeInWaterfallLoop(... WaterfallSgprs)
754+
if (!WaterfallSgprs.empty()) {
755+
MachineBasicBlock::iterator I = MI.getIterator();
756+
executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
757+
}
535758
}
536759

537760
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -543,6 +766,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
543766
case Vgpr16:
544767
return LLT::scalar(16);
545768
case Sgpr32:
769+
case Sgpr32_W:
546770
case Sgpr32Trunc:
547771
case Sgpr32AExt:
548772
case Sgpr32AExtBoolInReg:
@@ -578,6 +802,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
578802
case VgprV2S32:
579803
return LLT::fixed_vector(2, 32);
580804
case SgprV4S32:
805+
case SgprV4S32_W:
581806
case VgprV4S32:
582807
case UniInVgprV4S32:
583808
return LLT::fixed_vector(4, 32);
@@ -645,6 +870,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
645870
return VccRB;
646871
case Sgpr16:
647872
case Sgpr32:
873+
case Sgpr32_W:
648874
case Sgpr64:
649875
case SgprP1:
650876
case SgprP3:
@@ -653,6 +879,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
653879
case SgprV2S16:
654880
case SgprV2S32:
655881
case SgprV4S32:
882+
case SgprV4S32_W:
656883
case SgprB32:
657884
case SgprB64:
658885
case SgprB96:
@@ -894,6 +1121,15 @@ void RegBankLegalizeHelper::applyMappingSrc(
8941121
}
8951122
break;
8961123
}
1124+
// sgpr waterfall, scalars and vectors
1125+
case Sgpr32_W:
1126+
case SgprV4S32_W: {
1127+
assert(Ty == getTyFromID(MethodIDs[i]));
1128+
if (RB != SgprRB) {
1129+
SgprWaterfallOperandRegs.insert(Reg);
1130+
}
1131+
break;
1132+
}
8971133
// sgpr and vgpr scalars with extend
8981134
case Sgpr32AExt: {
8991135
// Note: this ext allows S1, and it is meant to be combined away.

0 commit comments

Comments
 (0)