AMDGPU/GlobalISel: Add waterfall lowering in regbanklegalize #142790
base: users/petar-avramovic/ral-combine
Conversation
Warning: This pull request is not mergeable via GitHub because a downstack PR is open. Once all requirements are satisfied, merge this PR as a stack on Graphite.
This stack of pull requests is managed by Graphite. Learn more about stacking.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Petar Avramovic (petar-avramovic)

Changes: Add rules for G_AMDGPU_BUFFER_LOAD and implement waterfall lowering for divergent operands that must be sgpr.

Patch is 89.10 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/142790.diff

17 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 00979f44f9d34..b3edb959e14c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -117,45 +117,73 @@ static LLT getReadAnyLaneSplitTy(LLT Ty) {
return LLT::scalar(32);
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI);
+typedef std::function<MachineInstrBuilder(MachineIRBuilder &, Register,
+ Register)>
+ ReadLaneFnTy;
+
+static Register buildReadLane(MachineIRBuilder &, Register,
+ const RegisterBankInfo &, ReadLaneFnTy);
static void unmergeReadAnyLane(MachineIRBuilder &B,
SmallVectorImpl<Register> &SgprDstParts,
LLT UnmergeTy, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+ const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
const RegisterBank *VgprRB = &RBI.getRegBank(AMDGPU::VGPRRegBankID);
auto Unmerge = B.buildUnmerge({VgprRB, UnmergeTy}, VgprSrc);
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
- SgprDstParts.push_back(buildReadAnyLane(B, Unmerge.getReg(i), RBI));
+ SgprDstParts.push_back(buildReadLane(B, Unmerge.getReg(i), RBI, BuildRL));
}
}
-static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
- const RegisterBankInfo &RBI) {
+static Register buildReadLane(MachineIRBuilder &B, Register VgprSrc,
+ const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildRL) {
LLT Ty = B.getMRI()->getType(VgprSrc);
const RegisterBank *SgprRB = &RBI.getRegBank(AMDGPU::SGPRRegBankID);
if (Ty.getSizeInBits() == 32) {
- return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {{SgprRB, Ty}}, {VgprSrc})
- .getReg(0);
+ Register SgprDst = B.getMRI()->createVirtualRegister({SgprRB, Ty});
+ return BuildRL(B, SgprDst, VgprSrc).getReg(0);
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildRL);
return B.buildMergeLikeInstr({SgprRB, Ty}, SgprDstParts).getReg(0);
}
-void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
- Register VgprSrc, const RegisterBankInfo &RBI) {
+static void buildReadLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI,
+ ReadLaneFnTy BuildReadLane) {
LLT Ty = B.getMRI()->getType(VgprSrc);
if (Ty.getSizeInBits() == 32) {
- B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ BuildReadLane(B, SgprDst, VgprSrc);
return;
}
SmallVector<Register, 8> SgprDstParts;
- unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI);
+ unmergeReadAnyLane(B, SgprDstParts, getReadAnyLaneSplitTy(Ty), VgprSrc, RBI,
+ BuildReadLane);
B.buildMergeLikeInstr(SgprDst, SgprDstParts).getReg(0);
}
+
+void AMDGPU::buildReadAnyLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildInstr(AMDGPU::G_AMDGPU_READANYLANE, {SgprDst}, {VgprSrc});
+ });
+}
+
+void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc, const RegisterBankInfo &RBI) {
+ return buildReadLane(
+ B, SgprDst, VgprSrc, RBI,
+ [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
+ return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
+ .addReg(VgprSrc);
+ });
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 0c89bb5cc6100..5e1000ee0ab26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -51,6 +51,8 @@ class IntrinsicLaneMaskAnalyzer {
void buildReadAnyLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
const RegisterBankInfo &RBI);
+void buildReadFirstLane(MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
+ const RegisterBankInfo &RBI);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 7ff822c6f6580..989130a20559d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -57,6 +57,226 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
lower(MI, Mapping, WaterfallSgprs);
}
+bool RegBankLegalizeHelper::executeInWaterfallLoop(
+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs) {
+ // Track use registers which have already been expanded with a readfirstlane
+ // sequence. This may have multiple uses if moving a sequence.
+ DenseMap<Register, Register> WaterfalledRegMap;
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction &MF = B.getMF();
+
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+ unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
+ if (ST.isWave32()) {
+ MovExecOpc = AMDGPU::S_MOV_B32;
+ MovExecTermOpc = AMDGPU::S_MOV_B32_term;
+ XorTermOpc = AMDGPU::S_XOR_B32_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
+ ExecReg = AMDGPU::EXEC_LO;
+ } else {
+ MovExecOpc = AMDGPU::S_MOV_B64;
+ MovExecTermOpc = AMDGPU::S_MOV_B64_term;
+ XorTermOpc = AMDGPU::S_XOR_B64_term;
+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
+ ExecReg = AMDGPU::EXEC;
+ }
+
+#ifndef NDEBUG
+ const int OrigRangeSize = std::distance(Range.begin(), Range.end());
+#endif
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
+ Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
+
+ // Don't bother using generic instructions/registers for the exec mask.
+ B.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(InitSaveExecReg);
+
+ Register SavedExec = MRI.createVirtualRegister(WaveRC);
+
+ // To insert the loop we need to split the block. Move everything before
+ // this point to a new block, and insert a new empty block before this
+ // instruction.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+ MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, BodyBB);
+ MF.insert(MBBI, RestoreExecBB);
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(BodyBB);
+ BodyBB->addSuccessor(RestoreExecBB);
+ BodyBB->addSuccessor(LoopBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
+
+ MBB.addSuccessor(LoopBB);
+ RestoreExecBB->addSuccessor(RemainderBB);
+
+ B.setInsertPt(*LoopBB, LoopBB->end());
+
+ // +-MBB:------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %Dst = MI %Vgpr |
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +-----------------+
+ // ->
+ // +-MBB-------------------------------+
+ // | ... |
+ // | %0 = G_INST_1 |
+ // | %SaveExecReg = S_MOV_B32 $exec_lo |
+ // +----------------|------------------+
+ // | /------------------------------|
+ // V V |
+ // +-LoopBB---------------------------------------------------------------+ |
+ // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
+ // | instead of executing for each lane, see if other lanes had | |
+ // | same value for %Vgpr and execute for them also. | |
+ // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
+ // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
+ // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
+ // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
+ // +----------------|-----------------------------------------------------+ |
+ // V |
+ // +-BodyBB------------------------------------------------------------+ |
+ // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
+ // | executed only for active lanes and written to Dst | |
+ // | $exec = S_XOR_B32 $exec, %SavedExec | |
+ // | set active lanes to 0 in SavedExec, lanes that did not write to | |
+ // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
+ // | SI_WATERFALL_LOOP LoopBB |-----|
+ // +----------------|--------------------------------------------------+
+ // V
+ // +-RestoreExecBB--------------------------+
+ // | $exec_lo = S_MOV_B32_term %SaveExecReg |
+ // +----------------|-----------------------+
+ // V
+ // +-RemainderBB:----------------------+
+ // | %1 = G_INST_2 |
+ // | ... |
+ // +---------------------------------- +
+
+ // Move the instruction into the loop body. Note we moved everything after
+ // Range.end() already into a new block, so Range.end() is no longer valid.
+ BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
+
+ // Figure out the iterator range after splicing the instructions.
+ MachineBasicBlock::iterator NewBegin = Range.begin()->getIterator();
+ auto NewEnd = BodyBB->end();
+ assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+
+ B.setMBB(*LoopBB);
+ Register CondReg;
+
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.all_uses()) {
+ Register OldReg = Op.getReg();
+ if (!SGPROperandRegs.count(OldReg))
+ continue;
+
+ // See if we already processed this register in another instruction in
+ // the sequence.
+ auto OldVal = WaterfalledRegMap.find(OldReg);
+ if (OldVal != WaterfalledRegMap.end()) {
+ Op.setReg(OldVal->second);
+ continue;
+ }
+
+ Register OpReg = Op.getReg();
+ LLT OpTy = MRI.getType(OpReg);
+
+ // TODO: support for agpr
+ assert(MRI.getRegBank(OpReg) == VgprRB);
+ Register CurrentLaneReg = MRI.createVirtualRegister({SgprRB, OpTy});
+ buildReadFirstLane(B, CurrentLaneReg, OpReg, RBI);
+
+ // Build the comparison(s), CurrentLaneReg == OpReg.
+ unsigned OpSize = OpTy.getSizeInBits();
+ bool Is64 = OpSize % 64 == 0;
+ unsigned PartSize = Is64 ? 64 : 32;
+ LLT PartTy = LLT::scalar(PartSize);
+ unsigned NumParts = OpSize / PartSize;
+ SmallVector<Register, 8> OpParts;
+ SmallVector<Register, 8> CurrentLaneParts;
+
+ if (NumParts == 1) {
+ OpParts.push_back(OpReg);
+ CurrentLaneParts.push_back(CurrentLaneReg);
+ } else {
+ auto UnmergeOp = B.buildUnmerge({VgprRB, PartTy}, OpReg);
+ auto UnmergeCurrLane = B.buildUnmerge({SgprRB, PartTy}, CurrentLaneReg);
+ for (unsigned i = 0; i < NumParts; ++i) {
+ OpParts.push_back(UnmergeOp.getReg(i));
+ CurrentLaneParts.push_back(UnmergeCurrLane.getReg(i));
+ }
+ }
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register CmpReg = MRI.createVirtualRegister(VccRB_S1);
+ B.buildICmp(CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
+
+ if (!CondReg) {
+ CondReg = CmpReg;
+ } else {
+ CondReg = B.buildAnd(VccRB_S1, CondReg, CmpReg).getReg(0);
+ }
+ }
+
+ Op.setReg(CurrentLaneReg);
+
+ // Make sure we don't re-process this register again.
+ WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
+ }
+ }
+
+ // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
+ Register CondRegLM =
+ MRI.createVirtualRegister({WaveRC, LLT::scalar(ST.isWave32() ? 32 : 64)});
+ B.buildIntrinsic(Intrinsic::amdgcn_ballot, CondRegLM).addReg(CondReg);
+
+ // Update EXEC, save the original EXEC value to SavedExec.
+ B.buildInstr(AndSaveExecOpc)
+ .addDef(SavedExec)
+ .addReg(CondRegLM, RegState::Kill);
+ MRI.setSimpleHint(SavedExec, CondRegLM);
+
+ B.setInsertPt(*BodyBB, BodyBB->end());
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+ B.buildInstr(XorTermOpc).addDef(ExecReg).addReg(ExecReg).addReg(SavedExec);
+
+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
+ // s_cbranch_scc0?
+
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
+ B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
+
+ // Save the EXEC mask before the loop.
+ B.setInsertPt(MBB, MBB.end());
+ B.buildInstr(MovExecOpc).addDef(SaveExecReg).addReg(ExecReg);
+
+ // Restore the EXEC mask after the loop.
+ B.setInsertPt(*RestoreExecBB, RestoreExecBB->begin());
+ B.buildInstr(MovExecTermOpc).addDef(ExecReg).addReg(SaveExecReg);
+
+ // Set the insert point after the original instruction, so any new
+ // instructions will be in the remainder.
+ B.setInsertPt(*RemainderBB, RemainderBB->begin());
+
+ return true;
+}
+
void RegBankLegalizeHelper::splitLoad(MachineInstr &MI,
ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
MachineFunction &MF = B.getMF();
@@ -395,7 +615,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
switch (Mapping.LoweringMethod) {
case DoNotLower:
- return;
+ break;
case VccExtToSel:
return lowerVccExtToSel(MI);
case UniExtToSel: {
@@ -531,7 +751,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
}
}
- // TODO: executeInWaterfallLoop(... WaterfallSgprs)
+ if (!WaterfallSgprs.empty()) {
+ MachineBasicBlock::iterator I = MI.getIterator();
+ executeInWaterfallLoop(B, make_range(I, std::next(I)), WaterfallSgprs);
+ }
}
LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
@@ -543,6 +766,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case Vgpr16:
return LLT::scalar(16);
case Sgpr32:
+ case Sgpr32_W:
case Sgpr32Trunc:
case Sgpr32AExt:
case Sgpr32AExtBoolInReg:
@@ -578,6 +802,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
case VgprV2S32:
return LLT::fixed_vector(2, 32);
case SgprV4S32:
+ case SgprV4S32_W:
case VgprV4S32:
case UniInVgprV4S32:
return LLT::fixed_vector(4, 32);
@@ -645,6 +870,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
return VccRB;
case Sgpr16:
case Sgpr32:
+ case Sgpr32_W:
case Sgpr64:
case SgprP1:
case SgprP3:
@@ -653,6 +879,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case SgprV2S16:
case SgprV2S32:
case SgprV4S32:
+ case SgprV4S32_W:
case SgprB32:
case SgprB64:
case SgprB96:
@@ -894,6 +1121,15 @@ void RegBankLegalizeHelper::applyMappingSrc(
}
break;
}
+ // sgpr waterfall, scalars and vectors
+ case Sgpr32_W:
+ case SgprV4S32_W: {
+ assert(Ty == getTyFromID(MethodIDs[i]));
+ if (RB != SgprRB) {
+ SgprWaterfallOperandRegs.insert(Reg);
+ }
+ break;
+ }
// sgpr and vgpr scalars with extend
case Sgpr32AExt: {
// Note: this ext allows S1, and it is meant to be combined away.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 128dc04be4d49..6a9aec811e999 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -202,9 +202,9 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
Ty == LLT::pointer(3, 32) || Ty == LLT::pointer(5, 32) ||
Ty == LLT::pointer(6, 32))
return B32;
- if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
- Ty == LLT::fixed_vector(4, 16) || Ty == LLT::pointer(1, 64) ||
- Ty == LLT::pointer(4, 64) ||
+ if (Ty == LLT::scalar(64) || Ty == LLT::pointer(0, 64) ||
+ Ty == LLT::fixed_vector(2, 32) || Ty == LLT::fixed_vector(4, 16) ||
+ Ty == LLT::pointer(1, 64) || Ty == LLT::pointer(4, 64) ||
(Ty.isPointer() && Ty.getAddressSpace() > AMDGPUAS::MAX_AMDGPU_ADDRESS))
return B64;
if (Ty == LLT::fixed_vector(3, 32))
@@ -504,7 +504,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_ICMP})
.Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
- .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}});
+ .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
+ .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
addRulesForGOpcs({G_FCMP})
.Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}})
@@ -641,6 +642,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// clang-format off
addRulesForGOpcs({G_LOAD})
.Any({{DivB32, DivP0}, {{VgprB32}, {VgprP0}}})
+ .Any({{DivB32, UniP0}, {{VgprB32}, {VgprP0}}})
.Any({{DivB32, DivP1}, {{VgprB32}, {VgprP1}}})
.Any({{{UniB256, UniP1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
@@ -660,6 +662,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB96, UniP4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasUnalignedLoads)
.Any({{{UniB96, UniP4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasUnalignedLoads)
+ .Any({{{UniB128, UniP4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
.Any({{{UniB256, UniP4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
.Any({{{UniB512, UniP4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
.Any({{{UniB32, UniP4}, !isNaturalAlignedSmall || !isUL}, {{UniInVgprB32}, {VgprP4}}}, hasSMRDSmall) // i8 and i16 load
@@ -674,11 +677,15 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Any({{{UniB32, UniP4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {VgprP4}}});
// clang-format on
- addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, Vector)
- .Div(S32, {{Vgpr32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(S32, {{UniInVgprS32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Div(V4S32, {{VgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}})
- .Uni(V4S32, {{UniInVgprV4S32}, {SgprV4S32, Vgpr32, Vgpr32, Sgpr32}});
+ addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD}, StandardB)
+ .Div(B32, {{VgprB32}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}})
+ .Uni(B32, {{UniInVgprB32}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}})
+ .Div(B64, {{VgprB64}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}})
+ .Uni(B64, {{UniInVgprB64}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}})
+ .Div(B96, {{VgprB96}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}})
+ .Uni(B96, {{UniInVgprB96}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}})
+ .Div(B128, {{VgprB128}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}})
+ .Uni(B128, {{UniInVgprB128}, {SgprV4S32_W, Vgpr32, Vgpr32, Sgpr32_W}});
addRulesForGOpcs({G_STORE})
.Any({{S32, P0}, {{}, {Vgpr32, VgprP0}}})
@@ -692,7 +699,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_PTR_ADD})
....
[truncated]
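As an aside for readers, the loop that the ASCII diagram in executeInWaterfallLoop describes can be simulated in plain C++. The sketch below is not part of the patch; the lane values and the printed "operation" are invented. Each iteration reads the first active lane's value, services every lane holding the same value, and clears those lanes from the exec mask:

// Editor's sketch, not part of the patch: a CPU-side simulation of the
// waterfall loop from the diagram above.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<uint32_t> Vgpr = {7, 3, 7, 3, 9, 7, 3, 9}; // divergent operand
  uint64_t Exec = (1ull << Vgpr.size()) - 1;             // all lanes active

  while (Exec) {
    // "readfirstlane": the lowest active lane's value becomes uniform.
    unsigned FirstLane = __builtin_ctzll(Exec);
    uint32_t Uniform = Vgpr[FirstLane];

    // "ballot" + "and_saveexec": keep only lanes holding the same value.
    uint64_t SameValMask = 0;
    for (unsigned L = 0; L < Vgpr.size(); ++L)
      if (((Exec >> L) & 1) && Vgpr[L] == Uniform)
        SameValMask |= 1ull << L;

    // BodyBB: the instruction runs once with a now-uniform (sgpr) operand.
    printf("execute with sgpr value %u for lanes 0x%llx\n", Uniform,
           (unsigned long long)SameValMask);

    // "s_xor_term": drop the serviced lanes; loop while any remain.
    Exec &= ~SameValMask;
  }
  return 0;
}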
return buildReadLane(
    B, SgprDst, VgprSrc, RBI,
    [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
      return B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, SgprDst)
Not for this PR, but we should really have an opcode for this too, instead of having one be an intrinsic and one be a generic opcode.
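For illustration, a sketch of how that might look. G_AMDGPU_READFIRSTLANE is a hypothetical generic opcode mirroring the existing G_AMDGPU_READANYLANE; this is the reviewer's idea, not part of the patch:

// Hypothetical: with a generic G_AMDGPU_READFIRSTLANE opcode, the two
// builders would be symmetric and neither would go through an intrinsic.
void AMDGPU::buildReadFirstLane(MachineIRBuilder &B, Register SgprDst,
                                Register VgprSrc, const RegisterBankInfo &RBI) {
  return buildReadLane(
      B, SgprDst, VgprSrc, RBI,
      [](MachineIRBuilder &B, Register SgprDst, Register VgprSrc) {
        return B.buildInstr(AMDGPU::G_AMDGPU_READFIRSTLANE, {SgprDst},
                            {VgprSrc});
      });
}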
Sgpr32_W,
SgprV4S32_W,
Can you add a trailing comment or rename this? The _W suffix is not immediately clear to me.
Added one above, is it clear now?
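For readers of this page, the resolved naming question amounts to a clarifying comment along these lines (the wording here is illustrative, not the exact comment added in the patch):

// Sgpr operand that can be waterfalled ("_W"): the operand must end up in an
// sgpr, but a divergent (vgpr) value is allowed; regbanklegalize then wraps
// the instruction in a waterfall loop instead of inserting a plain copy.
Sgpr32_W,
SgprV4S32_W,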
const SIRegisterInfo *TRI = ST.getRegisterInfo();
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
if (ST.isWave32()) {
nit: I think those could go in the class directly so this isn't repeated every time, no? The class is instantiated per function anyway.
It is instantiated per (ST, MRI) pair, not per function.
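For illustration, the suggested refactor might look like the sketch below. The constructor shape and member placement are assumptions; the patch as posted keeps these as locals in executeInWaterfallLoop:

// Hypothetical: compute the wave-size dependent opcodes once per helper
// instance instead of in every executeInWaterfallLoop call.
class RegBankLegalizeHelper {
  unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc;
  unsigned ExecReg;

public:
  RegBankLegalizeHelper(const GCNSubtarget &ST /*, ... */) {
    if (ST.isWave32()) {
      MovExecOpc = AMDGPU::S_MOV_B32;
      MovExecTermOpc = AMDGPU::S_MOV_B32_term;
      XorTermOpc = AMDGPU::S_XOR_B32_term;
      AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
      ExecReg = AMDGPU::EXEC_LO;
    } else {
      MovExecOpc = AMDGPU::S_MOV_B64;
      MovExecTermOpc = AMDGPU::S_MOV_B64_term;
      XorTermOpc = AMDGPU::S_XOR_B64_term;
      AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
      ExecReg = AMDGPU::EXEC;
    }
  }
};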
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-fast -o - %s | FileCheck %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=regbankselect -regbankselect-greedy -o - %s | FileCheck %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
@arsenm Is it fine to move tests entirely to this new RBSelect, or should we keep coverage for both until the old RB is removed?
Force-pushed from fcd0dc7 to 64d7853.
Force-pushed from 6dd26d4 to ae96216.
Add rules for G_AMDGPU_BUFFER_LOAD and implement waterfall lowering for divergent operands that must be sgpr.