Skip to content

Commit b668b64

Browse files
authored
[AMDGPU][True16][CodeGen] legalize 16bit and 32bit use-def chain for moveToVALU in si-fix-sgpr-lowering (#138734)
Two changes in this patch: 1. Covered another case in the legalizeOperandsVALUt16 function and the COPY lowering: when a SALU16 result is used by a SALU32 instruction, a reg_sequence must be inserted after the move to VALU (previously only the case of a SALU32 result used by a SALU16 instruction was handled). 2. Moved the useMI analysis into addUsersToMoveToVALUWorklist, legalizing the targeted operand when needed. Turned on the frem test in true16 mode for gfx1150, which was failing before this patch. A few bitcast tests are also impacted by this change, with some v_mov instructions being replaced by dual movs.
1 parent 817af2d commit b668b64

File tree

9 files changed

+3451
-3437
lines changed

9 files changed

+3451
-3437
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 55 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -7227,27 +7227,52 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
72277227
return DeferredList.contains(MI);
72287228
}
72297229

7230-
// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
7231-
// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
7232-
// subreg access properly. This can be removed after we have sgpr16 in place
7233-
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
7230+
// Legalize size mismatches between 16bit and 32bit registers in v2s copy
7231+
// lowering (change sgpr to vgpr).
7232+
// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
7233+
// size. Need to legalize the size of the operands during the vgpr lowering
7234+
// chain. This can be removed after we have sgpr16 in place
7235+
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
72347236
MachineRegisterInfo &MRI) const {
7235-
unsigned Opcode = Inst.getOpcode();
7236-
if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
7237+
if (!ST.useRealTrue16Insts())
72377238
return;
72387239

7239-
for (MachineOperand &Op : Inst.explicit_operands()) {
7240-
unsigned OpIdx = Op.getOperandNo();
7241-
if (!OpIdx)
7242-
continue;
7243-
if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
7244-
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7245-
const TargetRegisterClass *RC = RI.getRegClass(RCID);
7246-
if (RI.getRegSizeInBits(*RC) == 16) {
7247-
Op.setSubReg(AMDGPU::lo16);
7248-
}
7249-
}
7250-
}
7240+
unsigned Opcode = MI.getOpcode();
7241+
MachineBasicBlock *MBB = MI.getParent();
7242+
// Legalize operands and check for size mismatch
7243+
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
7244+
OpIdx >= get(Opcode).getNumOperands())
7245+
return;
7246+
7247+
MachineOperand &Op = MI.getOperand(OpIdx);
7248+
if (!Op.isReg() || !Op.getReg().isVirtual())
7249+
return;
7250+
7251+
const TargetRegisterClass *CurrRC = MRI.getRegClass(Op.getReg());
7252+
if (!RI.isVGPRClass(CurrRC))
7253+
return;
7254+
7255+
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
7256+
const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
7257+
if (RI.getMatchingSuperRegClass(CurrRC, ExpectedRC, AMDGPU::lo16)) {
7258+
Op.setSubReg(AMDGPU::lo16);
7259+
} else if (RI.getMatchingSuperRegClass(ExpectedRC, CurrRC, AMDGPU::lo16)) {
7260+
const DebugLoc &DL = MI.getDebugLoc();
7261+
Register NewDstReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7262+
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
7263+
BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
7264+
BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
7265+
.addReg(Op.getReg())
7266+
.addImm(AMDGPU::lo16)
7267+
.addReg(Undef)
7268+
.addImm(AMDGPU::hi16);
7269+
Op.setReg(NewDstReg);
7270+
}
7271+
}
7272+
void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
7273+
MachineRegisterInfo &MRI) const {
7274+
for (unsigned OpIdx = 1; OpIdx < MI.getNumExplicitOperands(); OpIdx++)
7275+
legalizeOperandsVALUt16(MI, OpIdx, MRI);
72517276
}
72527277

72537278
void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
@@ -7769,15 +7794,14 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77697794
return;
77707795
}
77717796

7772-
// If this is a v2s copy src from 16bit to 32bit,
7773-
// replace vgpr copy to reg_sequence
7797+
// If this is a v2s copy between 16bit and 32bit reg,
7798+
// replace the vgpr copy with a reg_sequence/extract_subreg
77747799
// This can be removed after we have sgpr16 in place
77757800
if (ST.useRealTrue16Insts() && Inst.isCopy() &&
77767801
Inst.getOperand(1).getReg().isVirtual() &&
77777802
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
77787803
const TargetRegisterClass *SrcRegRC = getOpRegClass(Inst, 1);
7779-
if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
7780-
32 == RI.getRegSizeInBits(*NewDstRC)) {
7804+
if (RI.getMatchingSuperRegClass(NewDstRC, SrcRegRC, AMDGPU::lo16)) {
77817805
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
77827806
Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
77837807
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
@@ -7789,7 +7813,13 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
77897813
.addReg(Undef)
77907814
.addImm(AMDGPU::hi16);
77917815
Inst.eraseFromParent();
7792-
7816+
MRI.replaceRegWith(DstReg, NewDstReg);
7817+
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7818+
return;
7819+
} else if (RI.getMatchingSuperRegClass(SrcRegRC, NewDstRC,
7820+
AMDGPU::lo16)) {
7821+
Inst.getOperand(1).setSubReg(AMDGPU::lo16);
7822+
Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
77937823
MRI.replaceRegWith(DstReg, NewDstReg);
77947824
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
77957825
return;
@@ -7885,23 +7915,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
78857915
assert(NewDstRC);
78867916
NewDstReg = MRI.createVirtualRegister(NewDstRC);
78877917
MRI.replaceRegWith(DstReg, NewDstReg);
7888-
7889-
// Check useMI of NewInstr. If used by a true16 instruction,
7890-
// add a lo16 subreg access if size mismatched
7891-
// This can be removed after we have sgpr16 in place
7892-
if (ST.useRealTrue16Insts() && NewDstRC == &AMDGPU::VGPR_32RegClass) {
7893-
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
7894-
E = MRI.use_end();
7895-
I != E; ++I) {
7896-
MachineInstr &UseMI = *I->getParent();
7897-
unsigned UseMIOpcode = UseMI.getOpcode();
7898-
if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
7899-
(16 ==
7900-
RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
7901-
I->setSubReg(AMDGPU::lo16);
7902-
}
7903-
}
7904-
}
79057918
}
79067919
fixImplicitOperands(*NewInstr);
79077920

@@ -8709,6 +8722,8 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
87098722
++I;
87108723
} while (I != E && I->getParent() == &UseMI);
87118724
} else {
8725+
legalizeOperandsVALUt16(UseMI, OpNo, MRI);
8726+
87128727
++I;
87138728
}
87148729
}

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1304,6 +1304,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
13041304
/// Fix operands in Inst to fix 16bit SALU to VALU lowering.
13051305
void legalizeOperandsVALUt16(MachineInstr &Inst,
13061306
MachineRegisterInfo &MRI) const;
1307+
void legalizeOperandsVALUt16(MachineInstr &Inst, unsigned OpIdx,
1308+
MachineRegisterInfo &MRI) const;
13071309

13081310
/// Replace the instructions opcode with the equivalent VALU
13091311
/// opcode. This function will also move the users of MachineInstruntions

0 commit comments

Comments
 (0)