AMDGPU: Try to perform copy to agpr from reg_sequence at the copy #129463

Merged
153 changes: 106 additions & 47 deletions llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1510,76 +1510,128 @@ bool SIFoldOperandsImpl::foldCopyToAGPRRegSequence(MachineInstr *CopyMI) const {
// only accept VGPR or inline immediate. Recreate a reg_sequence with its
// initializers right here, so we will rematerialize immediates and avoid
// copies via different reg classes.
if (!TRI->isAGPR(*MRI, CopyMI->getOperand(0).getReg()))
const TargetRegisterClass *DefRC =
MRI->getRegClass(CopyMI->getOperand(0).getReg());
if (!TRI->isAGPRClass(DefRC))
return false;

Register UseReg = CopyMI->getOperand(1).getReg();
SmallVector<std::pair<MachineOperand *, unsigned>, 32> Defs;
if (!getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32))
MachineInstr *RegSeq = MRI->getVRegDef(UseReg);
if (!RegSeq || !RegSeq->isRegSequence())
return false;

const DebugLoc &DL = CopyMI->getDebugLoc();
MachineBasicBlock &MBB = *CopyMI->getParent();

MachineInstrBuilder B(*MBB.getParent(), CopyMI);
DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenInputs;

const TargetRegisterClass *UseRC =
MRI->getRegClass(CopyMI->getOperand(1).getReg());

// Value, subregindex for new REG_SEQUENCE
SmallVector<std::pair<MachineOperand *, unsigned>, 32> NewDefs;

unsigned NumRegSeqOperands = RegSeq->getNumOperands();
unsigned NumFoldable = 0;

for (unsigned I = 1; I != NumRegSeqOperands; I += 2) {
MachineOperand &RegOp = RegSeq->getOperand(I);
unsigned SubRegIdx = RegSeq->getOperand(I + 1).getImm();

if (RegOp.getSubReg()) {
// TODO: Handle subregister compose
NewDefs.emplace_back(&RegOp, SubRegIdx);
continue;
}

MachineOperand *Lookup = lookUpCopyChain(*TII, *MRI, RegOp.getReg());
if (!Lookup)
Lookup = &RegOp;

if (Lookup->isImm()) {
// Check if this is an agpr_32 subregister.
const TargetRegisterClass *DestSuperRC = TRI->getMatchingSuperRegClass(
DefRC, &AMDGPU::AGPR_32RegClass, SubRegIdx);
if (DestSuperRC &&
TII->isInlineConstant(*Lookup, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
++NumFoldable;
NewDefs.emplace_back(Lookup, SubRegIdx);
continue;
}
}

const TargetRegisterClass *InputRC =
Lookup->isReg() ? MRI->getRegClass(Lookup->getReg())
: MRI->getRegClass(RegOp.getReg());

// TODO: Account for Lookup->getSubReg()

// If we can't find a matching super class, this is an SGPR->AGPR or
// VGPR->AGPR subreg copy (or something constant-like we have to materialize
// in the AGPR). We can't directly copy from SGPR to AGPR on gfx908, so we
// want to rewrite to copy to an intermediate VGPR class.
const TargetRegisterClass *MatchRC =
TRI->getMatchingSuperRegClass(DefRC, InputRC, SubRegIdx);
if (!MatchRC) {
++NumFoldable;
NewDefs.emplace_back(&RegOp, SubRegIdx);
continue;
}

NewDefs.emplace_back(&RegOp, SubRegIdx);
}

// Do not clone a reg_sequence and merely change the result register class.
if (NumFoldable == 0)
return false;
Comment on lines +1533 to +1588

Contributor:
Forgive my ignorance, but why isn't it sufficient to use (or modify) getRegSeqInit?

Contributor Author:
getRegSeqInit bakes in the assumption that every entry is 32 bits. This should handle arbitrary subregister indexes, and it has the heuristic checks mixed in with the source look-throughs.
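
For concreteness, here is a minimal hand-written MIR sketch of the kind of input the new walk can represent but a fixed 32-bit getRegSeqInit cannot; the function name, virtual register numbers, and the element-defining instructions are invented for illustration and are not taken from the patch or its tests:

```
---
name: copy_regseq_to_agpr_sketch
tracksRegLiveness: true
body: |
  bb.0:
    ; One 32-bit inline-immediate element plus one 64-bit element. The
    ; 64-bit piece (sub2_sub3) is what a per-entry 32-bit assumption
    ; cannot model; the new loop just records (value, subreg-index) pairs.
    %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
    %1:sreg_64 = S_MOV_B64 1
    %2:vreg_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %1, %subreg.sub2_sub3
    %3:areg_128 = COPY %2
...
```

When the fold fires, the COPY itself is rewritten into a REG_SEQUENCE of AGPR-compatible inputs: inline immediates are rematerialized with V_ACCVGPR_WRITE_B32_e64, and sources whose class does not match the destination subregister are routed through intermediate VGPR copies, as in the rewrite loop that follows in the diff.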


CopyMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
for (unsigned I = CopyMI->getNumOperands() - 1; I > 0; --I)
CopyMI->removeOperand(I);

MachineInstrBuilder B(*MBB.getParent(), CopyMI);
DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
for (unsigned I = 0, NumElts = Defs.size(); I != NumElts; ++I) {
MachineOperand *Def = Defs[I].first;
TargetInstrInfo::RegSubRegPair CopyToVGPR;
if (Def->isImm() &&
TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
int64_t Imm = Def->getImm();

auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
for (auto [Def, DestSubIdx] : NewDefs) {
if (!Def->isReg()) {
// TODO: Should we use single write for each repeated value like in
// register case?
Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp)
.addImm(Imm);
.add(*Def);
B.addReg(Tmp);
} else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
auto Src = getRegSubRegPair(*Def);
} else {
TargetInstrInfo::RegSubRegPair Src = getRegSubRegPair(*Def);
Def->setIsKill(false);
if (!SeenAGPRs.insert(Src)) {

Register &VGPRCopy = VGPRCopies[Src];
if (!VGPRCopy) {
const TargetRegisterClass *VGPRUseSubRC =
TRI->getSubRegisterClass(UseRC, DestSubIdx);

// We cannot build a reg_sequence out of the same registers, they
// must be copied. Better do it here before copyPhysReg() created
// several reads to do the AGPR->VGPR->AGPR copy.
CopyToVGPR = Src;
} else {
B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0, Src.SubReg);
}
} else {
assert(Def->isReg());
Def->setIsKill(false);
auto Src = getRegSubRegPair(*Def);

// Direct copy from SGPR to AGPR is not possible. To avoid creation
// of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
// create a copy here and track if we already have such a copy.
if (TRI->isSGPRReg(*MRI, Src.Reg)) {
CopyToVGPR = Src;
// Direct copy from SGPR to AGPR is not possible on gfx908. To avoid
// creation of exploded copies SGPR->VGPR->AGPR in the copyPhysReg()
// later, create a copy here and track if we already have such a copy.
if (TRI->getSubRegisterClass(MRI->getRegClass(Src.Reg), Src.SubReg) !=
VGPRUseSubRC) {
Comment on lines +1615 to +1619

Contributor:
How does this check for SGPR (as the comments explain)?

Contributor Author:
The register bank mismatch is covered by the generic subregister class logic
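
To illustrate the point with a rough, hypothetical MIR sketch (register numbers and the kernel-argument SGPR are invented): for the %0 element below, the VGPR_32 class the fold derives for the sub0 slot (via getSubRegisterClass on the reg_sequence's register class) does not equal the SGPR class of the source, so the comparison fails and the element is first copied into a fresh VGPR; %1 already matches and is added directly.

```
---
name: copy_regseq_with_sgpr_elt_sketch
tracksRegLiveness: true
body: |
  bb.0:
    liveins: $sgpr8
    ; %0 lives in an SGPR class; %1 is already a VGPR and needs no copy.
    %0:sreg_32 = COPY $sgpr8
    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
    %2:vreg_64 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1
    %3:areg_64 = COPY %2
...
```

No explicit SGPR test is needed: any source whose class cannot serve as the required VGPR subregister class takes the bounce-copy path.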

VGPRCopy = MRI->createVirtualRegister(VGPRUseSubRC);
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), VGPRCopy).add(*Def);
B.addReg(VGPRCopy);
} else {
// If it is already a VGPR, do not copy the register.
B.add(*Def);
}
} else {
auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
B.addReg(Tmp);
B.addReg(VGPRCopy);
}
}

if (CopyToVGPR.Reg) {
auto [It, Inserted] = VGPRCopies.try_emplace(CopyToVGPR);
Register &Vgpr = It->second;
if (Inserted) {
Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
}
Register Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
BuildMI(MBB, CopyMI, DL, TII->get(AMDGPU::COPY), Tmp).addReg(Vgpr);
B.addReg(Tmp);
}

B.addImm(Defs[I].second);
B.addImm(DestSubIdx);
}

LLVM_DEBUG(dbgs() << "Folded " << *CopyMI);
return true;
}
@@ -1634,6 +1686,13 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
foldCopyToVGPROfScalarAddOfFrameIndex(DstReg, OpToFold.getReg(), MI))
return true;

// Fold copy to AGPR through reg_sequence
// TODO: Handle with subregister extract
if (OpToFold.isReg() && MI.isCopy() && !MI.getOperand(1).getSubReg()) {
if (foldCopyToAGPRRegSequence(&MI))
return true;
}

bool Changed = foldInstOperand(MI, OpToFold);

// If we managed to fold all uses of this copy then we might as well
66 changes: 31 additions & 35 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -16,13 +16,15 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x24
; GCN-NEXT: s_mov_b64 s[36:37], 1
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s36, 2
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[36:37], s[36:37] op_sel:[0,1]
; GCN-NEXT: s_mov_b32 s38, 2
; GCN-NEXT: s_mov_b32 s39, s37
Comment on lines +19 to +20

Contributor:
Why can't it use s[36:37] as it did before?

Contributor Author:
This amounts to reordering; the v_pk_mov_b32 just moved below. I don't expect consistent regalloc decisions with movement.

; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[0:15], s[34:35], 0x0
; GCN-NEXT: s_load_dwordx16 s[16:31], s[34:35], 0x40
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[38:39], s[38:39] op_sel:[0,1]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a16, s16
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
@@ -38,7 +40,6 @@ define amdgpu_kernel void @test_mfma_f32_32x32x4bf16_1k(ptr addrspace(1) %arg) #
; GCN-NEXT: v_accvgpr_write_b32 a13, s13
; GCN-NEXT: v_accvgpr_write_b32 a14, s14
; GCN-NEXT: v_accvgpr_write_b32 a15, s15
; GCN-NEXT: v_accvgpr_write_b32 a16, s16
; GCN-NEXT: v_accvgpr_write_b32 a17, s17
; GCN-NEXT: v_accvgpr_write_b32 a18, s18
; GCN-NEXT: v_accvgpr_write_b32 a19, s19
@@ -317,31 +318,29 @@ bb:
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_imm:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT: s_mov_b64 s[0:1], 0
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x34
; GCN-NEXT: s_mov_b64 s[6:7], 1.0
; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT: s_mov_b64 s[8:9], 0
; GCN-NEXT: v_accvgpr_write_b32 a0, s8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
; GCN-NEXT: v_accvgpr_write_b32 a4, s4
; GCN-NEXT: v_accvgpr_write_b32 a5, s5
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: v_accvgpr_write_b32 a2, s8
; GCN-NEXT: v_accvgpr_write_b32 a4, s8
; GCN-NEXT: v_accvgpr_write_b32 a6, s6
; GCN-NEXT: v_accvgpr_write_b32 a1, s9
; GCN-NEXT: v_accvgpr_write_b32 a3, s9
; GCN-NEXT: v_accvgpr_write_b32 a5, s9
; GCN-NEXT: v_accvgpr_write_b32 a7, s7
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 0.0, double 0.0, double 0.0, double 1.0>, i32 0, i32 0, i32 0)
@@ -352,32 +351,29 @@ bb:
define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %arg, double %a, double %b) #0 {
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: s_mov_b32 s1, 0x405ec000
; GCN-NEXT: s_mov_b64 s[2:3], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0x405ec000
; GCN-NEXT: v_accvgpr_write_b32 a0, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: s_mov_b64 s[6:7], s[0:1]
; GCN-NEXT: v_accvgpr_write_b32 a0, s0
; GCN-NEXT: v_accvgpr_write_b32 a1, s1
; GCN-NEXT: v_accvgpr_write_b32 a2, s2
; GCN-NEXT: v_accvgpr_write_b32 a3, s3
; GCN-NEXT: v_accvgpr_write_b32 a4, s4
; GCN-NEXT: v_accvgpr_write_b32 a5, s5
; GCN-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: v_accvgpr_write_b32 a2, s6
; GCN-NEXT: v_accvgpr_write_b32 a4, s6
; GCN-NEXT: v_accvgpr_write_b32 a6, s6
; GCN-NEXT: v_accvgpr_write_b32 a1, s7
; GCN-NEXT: v_accvgpr_write_b32 a3, s7
; GCN-NEXT: v_accvgpr_write_b32 a5, s7
; GCN-NEXT: v_accvgpr_write_b32 a7, s7
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[12:13], s[12:13] op_sel:[0,1]
; GCN-NEXT: v_pk_mov_b32 v[2:3], s[8:9], s[8:9] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
; GCN-NEXT: s_endpgm
bb:
%mai.1 = tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double %a, double %b, <4 x double> <double 123.0, double 123.0, double 123.0, double 123.0>, i32 0, i32 0, i32 0)
11 changes: 7 additions & 4 deletions llvm/test/CodeGen/AMDGPU/acc-ldst.ll
@@ -190,11 +190,11 @@ bb:

; NB: for atomics both vdata and vdst shall be either VGPR or AGPR
; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic_store:
; GCN: v_accvgpr_write_b32 [[A_ZERO:a[0-9]+]], 0
; GCN: global_atomic_sub [[IN:v[0-9]+]], v{{[0-9:]+}}, v{{[0-9]+}}, s[{{[0-9:]+}}] glc
; GCN-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]]
; GCN-DAG: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]]
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[IN]]
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
; GCN: v_accvgpr_read_b32 [[V:v[0-9]+]], a[[N]]{{$}}
; GCN: global_atomic_add v{{[0-9]+}}, v{{[0-9:]+}}, [[V]], s[{{[0-9:]+}}] glc
@@ -217,7 +217,10 @@ bb:

; GCN-LABEL: {{^}}test_atomic_mfma_4xi32_atomic64_store:
; GCN: global_atomic_sub_x2 v[{{[0-9:]+}}], v{{[0-9:]+}}, v[{{[0-9:]+}}], s[{{[0-9:]+}}] glc
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 [[A_ZERO:a[0-9]+]], 0
; GCN: v_accvgpr_mov_b32 a{{[0-9]+}}, [[A_ZERO]]
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
; GCN: v_mfma_i32_4x4x4i8 a[[[N:[0-9]+]]:
; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}
; GCN: v_accvgpr_read_b32 v{{[0-9]+}}, a{{[0-9]+}}