AMDGPU: Try to perform copy to agpr from reg_sequence at the copy #129463
@@ -1510,76 +1510,128 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
  // only accept VGPR or inline immediate. Recreate a reg_sequence with its
  // initializers right here, so we will rematerialize immediates and avoid
  // copies via different reg classes.
  if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
  const TargetRegisterClass *DefRC =
      MRI->getRegClass(CopyMI->getOperand(0).getReg());
  if (!TRI->isAGPRClass(DefRC))
    return false;

  Register UseReg = CopyMI->getOperand(1).getReg();
  SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
  MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
  if (!RegSeq || !RegSeq->isRegSequence())
    return false;

  const DebugLoc &DL = CopyMI->getDebugLoc();
  MachineBasicBlock &MBB = *CopyMI->getParent();

  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenInputs;

  const TargetRegisterClass *UseRC =
      MRI->getRegClass(CopyMI->getOperand(1).getReg());

  // Value, subregindex for new REG_SEQUENCE
  SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;

  unsigned NumRegSeqOperands = RegSeq->getNumOperands();
  unsigned NumFoldable = 0;

  for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
    MachineOperand &RegOp = RegSeq->getOperand(I);
    unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();

    if (RegOp.getSubReg()) {
      // TODO: Handle subregister compose
      NewDefs.emplace_back(&RegOp, SubRegIdx);
      continue;
    }

    MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
    if (!Lookup)
      Lookup = &RegOp;

    if (Lookup->isImm()) {
      // Check if this is an agpr_32 subregister.
      const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
          DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
      if (DestSuperRC &&
          TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        ++NumFoldable;
        NewDefs.emplace_back(Lookup, SubRegIdx);
        continue;
      }
    }

    const TargetRegisterClass *InputRC =
        Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
                        : MRI->getRegClass(RegOp.getReg());

    // TODO: Account for Lookup->getSubReg()

    // If we can't find a matching super class, this is an SGPR->AGPR or
    // VGPR->AGPR subreg copy (or something constant-like we have to materialize
    // in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
    // want to rewrite to copy to an intermediate VGPR class.
    const TargetRegisterClass *MatchRC =
        TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
    if (!MatchRC) {
      ++NumFoldable;
      NewDefs.emplace_back(&RegOp, SubRegIdx);
      continue;
    }

    NewDefs.emplace_back(&RegOp, SubRegIdx);
  }

  // Do not clone a reg_sequence and merely change the result register class.
  if (NumFoldable == 0)
    return false;

  CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
  for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
    CopyMI->removeOperand(I);

  MachineInstrBuilder B(*MBB.getParent(), CopyMI);
  DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
  SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
  for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
    MachineOperand *Def = Defs[I].first;
    TargetInstrInfo::RegSubRegPair CopyToVGPR;
    if (Def->isImm() &&
        TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
      int64_t Imm = Def->getImm();

      auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
  for (auto [Def, DestSubIdx] : NewDefs) {
    if (!Def->isReg()) {
      // TODO: Should we use single write for each repeated value like in
      // register case?
      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
          .addImm(Imm);
          .add(*Def);
      B.addReg(Tmp);
    } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
      auto Src = getRegSubRegPair(*Def);
    } else {
      TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
      Def->setIsKill(false);
      if (!SeenAGPRs.insert(Src)) {

      Register &VGPRCopy = VGPRCopies[Src];
      if (!VGPRCopy) {
        const TargetRegisterClass *VGPRUseSubRC =
            TRI->getSubRegisterClass(UseRC, DestSubIdx);

        // We cannot build a reg_sequence out of the same registers, they
        // must be copied. Better do it here before copyPhysReg() created
        // several reads to do the AGPR->VGPR->AGPR copy.
        CopyToVGPR = Src;
      } else {
        B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
      }
    } else {
      assert(Def->isReg());
      Def->setIsKill(false);
      auto Src = getRegSubRegPair(*Def);

      // Direct copy from SGPR to AGPR is not possible. To avoid creation
      // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
      // create a copy here and track if we already have such a copy.
      if (TRI->isSGPRReg(*MRI, Src.Reg)) {
        CopyToVGPR = Src;
        // Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
        // creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
        // later, create a copy here and track if we already have such a copy.
        if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) !=
            VGPRUseSubRC) {
Comment on lines +1615 to +1619: How does this check for SGPR (as the comments explain)?

Reply: The register bank mismatch is covered by the generic subregister class logic.
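A minimal sketch of what that reply means, reusing the patch's names (Src, VGPRUseSubRC, TRI, and MRI are assumed from the surrounding code; nothing here is part of the patch): the subregister class of an SGPR source is an SGPR-only class, while VGPRUseSubRC is the VGPR class the element needs, so the inequality also fires for SGPR inputs without naming SGPRs explicitly, covering what the old TRI->isSGPRReg() check handled.

  // Sketch only, not part of the patch: an SGPR source's subregister class
  // can never equal the required VGPR subregister class, so the mismatch
  // branch below creates the intermediate VGPR copy for SGPR inputs too.
  const TargetRegisterClass *SrcSubRC =
      TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg);
  bool NeedsIntermediateVGPRCopy = SrcSubRC != VGPRUseSubRC;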
          VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
          BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
          B.addReg(VGPRCopy);
        } else {
          // If it is already a VGPR, do not copy the register.
          B.add(*Def);
        }
      } else {
        auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
        B.addReg(Tmp);
        B.addReg(VGPRCopy);
      }
    }

    if (CopyToVGPR.Reg) {
      auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
      Register &Vgpr = It->second;
      if (Inserted) {
        Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
      }
      Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
      BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).addReg(Vgpr);
      B.addReg(Tmp);
    }

    B.addImm(Defs[I].second);
    B.addImm(DestSubIdx);
  }

  LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
  return true;
}
@@ -1634,6 +1686,13 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
      foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
    return true;

  // Fold copy to AGPR through reg_sequence
  // TODO: Handle with subregister extract
  if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
    if (foldCopyToAGPRRegSequence(&MI))
      return true;
  }

  bool Changed = foldInstOperand(MI, OpToFold);

  // If we managed to fold all uses of this copy then we might as well
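The fold above leans on lookUpCopyChain to see through copies when classifying each REG_SEQUENCE input. That helper is defined elsewhere in SIFoldOperands.cpp and is not shown in this diff; the following is only a hypothetical sketch of the idea, with an illustrative name and details that may differ from the real implementation: walk full-register copies back through their defs so an immediate that was first materialized into a register is still recognized as an immediate.

  // Hypothetical sketch, not the real lookUpCopyChain: follow full-register
  // COPYs back through SSA defs and report a materialized immediate if the
  // chain ends in a move-immediate. Callers fall back to the original operand
  // when this returns null.
  static MachineOperand *lookThroughCopyChainSketch(MachineRegisterInfo &MRI,
                                                    Register Reg) {
    while (Reg.isVirtual()) {
      MachineInstr *Def = MRI.getVRegDef(Reg);
      if (!Def)
        return nullptr;
      if (Def->isCopy() && !Def->getOperand(1).getSubReg()) {
        Reg = Def->getOperand(1).getReg(); // Step through the copy.
        continue;
      }
      if (Def->isMoveImmediate() && Def->getOperand(1).isImm())
        return &Def->getOperand(1);        // Found the immediate source.
      return nullptr;                      // Chain ends at something else.
    }
    return nullptr;
  }

With something along these lines in place, an input that was defined by a move of an inline constant can be rematerialized directly with v_accvgpr_write_b32 instead of being copied through different register classes, which is what the test churn below reflects.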
@@ -16,13 +16,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GCN-NEXT: s_mov_b64 s[36:37], 1
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s36, 2
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s38, 2
; GCN-NEXT: s_mov_b32 s39, s37
Comment on lines +19 to +20: Why can't it use s[36:37] as it did before?

Reply: This amounts to reordering; the v_pk_mov_b32 just moved below. I don't expect consistent regalloc decisions with movement.
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a16, s16
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3

@@ -38,7 +40,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: v_accvgpr_write_b32 a13, s13
; GCN-NEXT: v_accvgpr_write_b32 a14, s14
; GCN-NEXT: v_accvgpr_write_b32 a15, s15
; GCN-NEXT: v_accvgpr_write_b32 a16, s16
; GCN-NEXT: v_accvgpr_write_b32 a17, s17
; GCN-NEXT: v_accvgpr_write_b32 a18, s18
; GCN-NEXT: v_accvgpr_write_b32 a19, s19

@@ -317,31 +318,29 @@ bb:
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT: s_mov_b64 s[0:1], 0
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34
; GCN-NEXT: s_mov_b64 s[6:7], 1.0
; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT: s_mov_b64 s[8:9], 0
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
; GCN-NEXT: v_accvgpr_write_b32 a4, s4
; GCN-NEXT: v_accvgpr_write_b32 a5, s5
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: v_accvgpr_write_b32 a2, s8
; GCN-NEXT: v_accvgpr_write_b32 a4, s8
; GCN-NEXT: v_accvgpr_write_b32 a6, s6
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a3, s9
; GCN-NEXT: v_accvgpr_write_b32 a5, s9
; GCN-NEXT: v_accvgpr_write_b32 a7, s7
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)

@@ -352,32 +351,29 @@ bb:
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_mov_b32 s1, 0x405ec000
; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0x405ec000
; GCN-NEXT: v_accvgpr_write_b32 a0, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
; GCN-NEXT: v_accvgpr_write_b32 a4, s4
; GCN-NEXT: v_accvgpr_write_b32 a5, s5
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: v_accvgpr_write_b32 a2, s6
; GCN-NEXT: v_accvgpr_write_b32 a4, s6
; GCN-NEXT: v_accvgpr_write_b32 a6, s6
; GCN-NEXT: v_accvgpr_write_b32 a1, s7
; GCN-NEXT: v_accvgpr_write_b32 a3, s7
; GCN-NEXT: v_accvgpr_write_b32 a5, s7
; GCN-NEXT: v_accvgpr_write_b32 a7, s7
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
Comment: Forgive my ignorance, but why isn't it sufficient to use (or modify) getRegSeqInit?

Reply: getRegSeqInit bakes in the assumption that every entry is 32 bits. This should handle arbitrary subregister indexes, and it has the heuristic checks mixed in with the source look-throughs.
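A small illustration of that distinction, mirroring the operand walk from the patch (RegSeq and TRI are assumed from that context; the debug print is only for exposition): REG_SEQUENCE sources come in (value, subregister-index) pairs starting at operand 1, and keeping the pair intact lets each element be whatever width its index describes, rather than the one-32-bit-lane-per-entry shape getRegSeqInit assumes.

  // Illustration only: operand 0 is the def; sources follow as
  // (value, subreg-index) pairs. An index such as sub0_sub1 covers 64 bits,
  // which a helper that flattens everything to 32-bit lanes cannot represent.
  for (unsigned I = 1, E = RegSeq->getNumOperands(); I != E; I += 2) {
    const MachineOperand &Val = RegSeq->getOperand(I);
    unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();
    unsigned SizeInBits = TRI->getSubRegIdxSize(SubRegIdx);
    LLVM_DEBUG(dbgs() << "reg_sequence element " << Val << " -> index "
                      << SubRegIdx << " (" << SizeInBits << " bits)\n");
  }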