AMDGPU: Factor agpr reg_sequence folding into a function #129002
Merged: arsenm merged 3 commits into main from users/arsenm/amdgpu/move-agpr-through-reg-sequence-to-helper-function on Feb 27, 2025
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/129002.diff

1 file affected:
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3a019dbaad02c..f1ba199fbae3f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -128,6 +128,8 @@ class SIFoldOperandsImpl {
bool tryFoldCndMask(MachineInstr &MI) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+
+ bool foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const;
bool tryFoldFoldableCopy(MachineInstr &MI,
MachineOperand *&CurrentKnownM0Val) const;
@@ -1012,7 +1014,6 @@ void SIFoldOperandsImpl::foldOperand(
UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
- unsigned Size = TII->getOpSize(*UseMI, 1);
Register UseReg = OpToFold.getReg();
UseMI->getOperand(1).setReg(UseReg);
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
@@ -1021,84 +1022,9 @@ void SIFoldOperandsImpl::foldOperand(
OpToFold.setIsKill(false);
// Remove kill flags as kills may now be out of order with uses.
- MRI->clearKillFlags(OpToFold.getReg());
-
- // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
- // can only accept VGPR or inline immediate. Recreate a reg_sequence with
- // its initializers right here, so we will rematerialize immediates and
- // avoid copies via different reg classes.
- SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
- if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
- getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- const DebugLoc &DL = UseMI->getDebugLoc();
- MachineBasicBlock &MBB = *UseMI->getParent();
-
- UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
- for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
- UseMI->removeOperand(I);
-
- MachineInstrBuilder B(*MBB.getParent(), UseMI);
- DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
- SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
- for (unsigned I = 0; I < Size / 4; ++I) {
- MachineOperand *Def = Defs[I].first;
- TargetInstrInfo::RegSubRegPair CopyToVGPR;
- if (Def->isImm() &&
- TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
- int64_t Imm = Def->getImm();
-
- auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
- BuildMI(MBB, UseMI, DL,
- TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
- B.addReg(Tmp);
- } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
- auto Src = getRegSubRegPair(*Def);
- Def->setIsKill(false);
- if (!SeenAGPRs.insert(Src)) {
- // We cannot build a reg_sequence out of the same registers, they
- // must be copied. Better do it here before copyPhysReg() created
- // several reads to do the AGPR->VGPR->AGPR copy.
- CopyToVGPR = Src;
- } else {
- B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
- Src.SubReg);
- }
- } else {
- assert(Def->isReg());
- Def->setIsKill(false);
- auto Src = getRegSubRegPair(*Def);
-
- // Direct copy from SGPR to AGPR is not possible. To avoid creation
- // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
- // create a copy here and track if we already have such a copy.
- if (TRI->isSGPRReg(*MRI, Src.Reg)) {
- CopyToVGPR = Src;
- } else {
- auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
- BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
- B.addReg(Tmp);
- }
- }
-
- if (CopyToVGPR.Reg) {
- auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
- Register &Vgpr = It->second;
- if (Inserted) {
- Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
- }
- auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
- BuildMI(MBB, UseMI, DL,
- TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
- B.addReg(Tmp);
- }
-
- B.addImm(Defs[I].second);
- }
- LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
- }
-
- return;
+ MRI->clearKillFlags(UseReg);
+ if (foldCopyToAGPRRegSequence(UseMI))
+ return;
}
unsigned UseOpc = UseMI->getOpcode();
@@ -1558,6 +1484,88 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
return true;
}
+/// Fold %agpr = COPY (REG_SEQUENCE x_MOV_B32, ...) into REG_SEQUENCE
+/// (V_ACCVGPR_WRITE_B32_e64) ... depending on the reg_sequence input values.
+bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
+ // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32 can
+ // only accept VGPR or inline immediate. Recreate a reg_sequence with its
+ // initializers right here, so we will rematerialize immediates and avoid
+ // copies via different reg classes.
+ if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
+ return false;
+ Register UseReg = CopyMI->getOperand(1).getReg();
+ SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
+ if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
+ return false;
+
+ const DebugLoc &DL = CopyMI->getDebugLoc();
+ MachineBasicBlock &MBB = *CopyMI->getParent();
+
+ CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+ for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
+ CopyMI->removeOperand(I);
+
+ MachineInstrBuilder B(*MBB.getParent(), CopyMI);
+ DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+ SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+ for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
+ MachineOperand *Def = Defs[I].first;
+ TargetInstrInfo::RegSubRegPair CopyToVGPR;
+ if (Def->isImm() &&
+ TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ int64_t Imm = Def->getImm();
+
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+ .addImm(Imm);
+ B.addReg(Tmp);
+ } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+ auto Src = getRegSubRegPair(*Def);
+ Def->setIsKill(false);
+ if (!SeenAGPRs.insert(Src)) {
+ // We cannot build a reg_sequence out of the same registers, they
+ // must be copied. Better do it here before copyPhysReg() created
+ // several reads to do the AGPR->VGPR->AGPR copy.
+ CopyToVGPR = Src;
+ } else {
+ B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
+ }
+ } else {
+ assert(Def->isReg());
+ Def->setIsKill(false);
+ auto Src = getRegSubRegPair(*Def);
+
+ // Direct copy from SGPR to AGPR is not possible. To avoid creation
+ // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+ // create a copy here and track if we already have such a copy.
+ if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+ CopyToVGPR = Src;
+ } else {
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+ B.addReg(Tmp);
+ }
+ }
+
+ if (CopyToVGPR.Reg) {
+ auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
+ Register &Vgpr = It->second;
+ if (Inserted) {
+ Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+ }
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
+ .addReg(Vgpr);
+ B.addReg(Tmp);
+ }
+
+ B.addImm(Defs[I].second);
+ }
+ LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
+ return true;
+}
+
bool SIFoldOperandsImpl::tryFoldFoldableCopy(
MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
Register DstReg = MI.getOperand(0).getReg();
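For readers skimming the diff: the moved code rewrites a COPY into an AGPR tuple as a REG_SEQUENCE built out of V_ACCVGPR_WRITE_B32_e64 instructions, since v_accvgpr_write_b32 only accepts VGPR or inline-immediate sources. A rough before/after sketch in MIR follows; the register names, classes, and immediate value are illustrative assumptions rather than lines from the patch's tests, and this PR only moves the logic into foldCopyToAGPRRegSequence without intending to change behavior.

Before the fold, the AGPR tuple is initialized through a plain COPY of a reg_sequence:

    %imm:vgpr_32 = V_MOV_B32_e32 0
    %rs:vreg_64  = REG_SEQUENCE %imm, %subreg.sub0, %s:sgpr_32, %subreg.sub1
    %dst:areg_64 = COPY %rs

After the fold, the COPY itself is mutated in place (setDesc) into a REG_SEQUENCE whose elements are rebuilt per the three cases in the loop:

    %a0:agpr_32 = V_ACCVGPR_WRITE_B32_e64 0    ; inline immediate rematerialized
    %v0:vgpr_32 = COPY %s                      ; SGPR routed through a VGPR; reused
                                               ; via the VGPRCopies cache if the
                                               ; same source appears again
    %a1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 %v0  ; VGPR -> AGPR write
    %dst:areg_64 = REG_SEQUENCE %a0, %subreg.sub0, %a1, %subreg.sub1

Building the per-element copies here, before register allocation, avoids the exploded AGPR->VGPR->AGPR and SGPR->VGPR->AGPR sequences that copyPhysReg() would otherwise emit later.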
JanekvO approved these changes on Feb 27, 2025.
joaosaffran pushed a commit to joaosaffran/llvm-project that referenced this pull request on Mar 3, 2025.