[AMDGPU] Swap V_CNDMASK operands to shrink it into VOP2 #135162
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu
Author: Ana Mihajlovic (mihajlovicana)
Changes: Switch the operands of v_cndmask x, y, where y is a constant, so that the VOP2 format can be used instead of VOP3 (this also requires inverting the comparison that produces the condition). Doing so allows these instructions to later be merged into v_dual_cndmask.
Patch is 217.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135162.diff
25 Files Affected:
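For example, in the extractelement test below, v_cndmask_b32_e64 v1, v1, 4.0, vcc (fed by v_cmp_eq_u32_e32 vcc, 3, v0) becomes v_cndmask_b32_e32 v1, 4.0, v1, vcc with the compare inverted to v_cmp_ne_u32_e32 vcc, 3, v0: the constant moves into src0, which the VOP2 encoding allows, while src1 stays a VGPR.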
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 73343e1c80f33..0b1638c25b9ae 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -51,6 +51,11 @@ class SIShrinkInstructions {
unsigned SubReg) const;
bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
unsigned SubReg) const;
+ bool trySwitchOperands(MachineInstr &MI, Register *OldVCC,
+ Register *NewVCC) const;
+ bool shouldSwitchOperands(MachineRegisterInfo &MRI, MachineInstr &MI,
+ const SIInstrInfo &TII) const;
+ unsigned getInverseCompareOpcode(MachineInstr &MI) const;
TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
unsigned I) const;
void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
@@ -831,6 +836,109 @@ bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
return true;
}
+unsigned SIShrinkInstructions::getInverseCompareOpcode(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ // unsigned 32
+ case AMDGPU::V_CMP_EQ_U32_e64:
+ return AMDGPU::V_CMP_NE_U32_e64;
+ case AMDGPU::V_CMP_NE_U32_e64:
+ return AMDGPU::V_CMP_EQ_U32_e64;
+ case AMDGPU::V_CMP_GE_U32_e64:
+ return AMDGPU::V_CMP_LT_U32_e64;
+ case AMDGPU::V_CMP_LE_U32_e64:
+ return AMDGPU::V_CMP_GT_U32_e64;
+ case AMDGPU::V_CMP_GT_U32_e64:
+ return AMDGPU::V_CMP_LE_U32_e64;
+ case AMDGPU::V_CMP_LT_U32_e64:
+ return AMDGPU::V_CMP_GE_U32_e64;
+ // float 32
+ case AMDGPU::V_CMP_EQ_F32_e64:
+ return AMDGPU::V_CMP_NEQ_F32_e64;
+ case AMDGPU::V_CMP_NEQ_F32_e64:
+ return AMDGPU::V_CMP_EQ_F32_e64;
+ case AMDGPU::V_CMP_GE_F32_e64:
+ return AMDGPU::V_CMP_LT_F32_e64;
+ case AMDGPU::V_CMP_LE_F32_e64:
+ return AMDGPU::V_CMP_GT_F32_e64;
+ case AMDGPU::V_CMP_GT_F32_e64:
+ return AMDGPU::V_CMP_LE_F32_e64;
+ case AMDGPU::V_CMP_LT_F32_e64:
+ return AMDGPU::V_CMP_GE_F32_e64;
+ default:
+ return 0;
+ }
+}
+
+bool SIShrinkInstructions::shouldSwitchOperands(MachineRegisterInfo &MRI,
+ MachineInstr &MI,
+ const SIInstrInfo &TII) const {
+ auto allUses = MRI.use_nodbg_operands(MI.getOperand(5).getReg());
+ unsigned Count = 0;
+
+ for (auto &Use : allUses) {
+ if (Use.getParent()->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+ return false;
+ MachineOperand *Src0 =
+ TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src0);
+ MachineOperand *Src1 =
+ TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src1);
+
+ auto Src0Imm = Src0->isImm();
+ auto Src1Imm = Src1->isImm();
+
+ if (!Src1Imm && Src0Imm)
+ return false;
+ if (Src1Imm && !Src0Imm)
+ Count++;
+ }
+ return (Count >= 1);
+}
+
+// OldVCC and NewVCC are used to remember VCC after inverting comparison
+bool SIShrinkInstructions::trySwitchOperands(MachineInstr &MI, Register *OldVCC,
+ Register *NewVCC) const {
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Reg = MI.getOperand(5).getReg();
+ if (!Reg.isVirtual())
+ return false;
+
+ if (*OldVCC != Reg) {
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (DefMI) {
+ unsigned Opcode = getInverseCompareOpcode(*DefMI);
+ if (Opcode &&
+ SIShrinkInstructions::shouldSwitchOperands(*MRI, MI, *TII)) {
+ auto cmpDL = DefMI->getDebugLoc();
+ *NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+ *OldVCC = Reg;
+ MachineInstrBuilder InverseCompare = BuildMI(
+ *DefMI->getParent(), DefMI, cmpDL, TII->get(Opcode), *NewVCC);
+ InverseCompare->setFlags(DefMI->getFlags());
+
+ unsigned OpNum = DefMI->getNumExplicitOperands();
+ for (unsigned i = 1; i < OpNum; i++) {
+ MachineOperand Op = DefMI->getOperand(i);
+ InverseCompare.add(Op);
+ if (Op.isReg() && Op.isKill())
+ InverseCompare->getOperand(i).setIsKill(false);
+ }
+ }
+ }
+ }
+ if (*OldVCC == Reg) {
+ BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addReg(*NewVCC);
+ MI.eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
bool SIShrinkInstructions::run(MachineFunction &MF) {
this->MF = &MF;
@@ -842,6 +950,8 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
std::vector<unsigned> I1Defs;
+ Register OldVCC = AMDGPU::NoRegister;
+ Register NewVCC = AMDGPU::NoRegister;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -973,6 +1083,10 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
continue;
}
+ if (MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
+ trySwitchOperands(MI, &OldVCC, &NewVCC))
+ MRI->setRegAllocationHint(NewVCC, 0, VCCReg);
+
// If there is no chance we will shrink it and use VCC as sdst to get
// a 32 bit form try to replace dead sdst with NULL.
if (TII->isVOP3(MI.getOpcode())) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 31a229a908142..8921950af5f8b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -13,9 +13,9 @@ define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
; GCN-NEXT: v_cndmask_b32_e64 v6, 1.0, 2.0, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0x40a00000
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 4.0, v1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_mov_b32_e32 v3, 0x40c00000
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -34,18 +34,18 @@ define float @dyn_extract_v8f32_const_s_v(i32 %sel) {
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo
-; GFX10PLUS-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v1, 0x41000000, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 2, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, 0x40400000, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, 4.0, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 4, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, 0x40a00000, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 5, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, 0x40c00000, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 6, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, 0x40e00000, v1, vcc_lo
+; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 7, v0
+; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, 0x41000000, v1, vcc_lo
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
entry:
%ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
@@ -3385,9 +3385,9 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
; GCN-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0x40a00000
-; GCN-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, 4.0, v1, vcc
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0
; GCN-NEXT: v_mov_b32_e32 v3, 0x40c00000
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -3429,32 +3429,32 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 2, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x40400000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 4.0, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 4, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x40a00000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 5, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x40c00000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 6, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x40e00000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 7, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 8, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41100000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 9, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41200000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 10, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41300000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 11, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41400000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 12, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41500000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 13, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41600000, v1, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 14, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x41700000, v1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3464,32 +3464,32 @@ define float @dyn_extract_v15f32_const_s_v(i32 %sel) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40400000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40a00000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 5, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40c00000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 6, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x40e00000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 7, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41000000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 8, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41100000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 9, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41200000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 10, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41300000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 11, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41400000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 12, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41500000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 13, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41600000, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 14, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0x41700000, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 2, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x40400000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 4, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x40a00000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 5, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x40c00000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 6, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x40e00000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 7, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41000000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 8, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41100000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 9, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41200000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 10, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41300000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 11, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41400000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 12, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41500000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 13, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41600000, v1, vcc_lo
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 14, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x41700000, v1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 15, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index 723ad5646c0a3..03b713f6866a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -5155,8 +5155,8 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX8-NEXT: s_and_b32 s0, 1, s2
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, 0, s0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s9, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5202,8 +5202,8 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT: s_and_b32 s0, 1, s2
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, 0, s0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s9, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5241,16 +5241,16 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX10-NEXT: s_and_b32 s0, 1, s10
; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX10-NEXT: s_cselect_b32 s1, 1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX10-NEXT: v_mov_b32_e32 v2, s5
; GFX10-NEXT: s_ashr_i32 s0, s9, 31
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, 0, s1
; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v2, s5
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_mov_b32_e32 v1, s4
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5282,16 +5282,15 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-NEXT: s_and_b32 s0, 1, s10
; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
-; GFX11-NEXT: s_cselect_b32 s1, 1, 0
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT: s_cselect_b32 s1, 1, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2
; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
-; GFX11-NEXT: v_mov_b32_e32 v2, s5
; GFX11-NEXT: s_ashr_i32 s0, s9, 31
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT: v_cmp_eq_u32_e64 vcc_lo, 0, s1
; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
+; GFX11-NEXT: v_dual_cndmask_b32 v1, 0, v2 :: v_dual_mov_b32 v2, s5
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5511,8 +5510,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX8-NEXT: s_and_b32 s0, 1, s4
-; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, 0, s0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX8-NEXT: v_bfrev_b32_e32 v1, 1
@@ -5545,8 +5544,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT: s_and_b32 s0, 1, s4
-; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, 0, s0
+; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5572,13 +5571,13 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0
; GFX10-NEXT: v_cndmas...
[truncated]
This is a modified version of #135007 (the code is moved from the fold-operands pass to the shrink-instructions pass).
Force-pushed from 3cf7cb8 to 5f807e6
Force-pushed from 5f807e6 to 7c04a66
- Needs rebase
- Please add tests for a negative case, a case with source modifiers, and a case where the shouldSwapCndOperands counter is positive.
Should this be added for all compare kinds?
No, just one is enough.
Force-pushed from 6a8df0f to 41b069e
You can test this locally with the following command:
git-clang-format --diff HEAD~1 HEAD --extensions cpp -- llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
View the diff from clang-format here:
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 9727ffc14..417719b9b 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -966,7 +966,7 @@ bool SIShrinkInstructions::shouldSwapCndOperands(
MachineOperand &Src0 = UseInst.getOperand(2);
MachineOperand &Src1 = UseInst.getOperand(4);
- //if instruction has source modifiers it cannot be converted to VOP2
+ // if instruction has source modifiers it cannot be converted to VOP2
if (UseInst.getOperand(1).getImm() != SISrcMods::NONE ||
UseInst.getOperand(3).getImm() != SISrcMods::NONE)
continue;
@@ -974,7 +974,7 @@ bool SIShrinkInstructions::shouldSwapCndOperands(
bool Src0IsVGPR = Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg());
bool Src1IsVGPR = Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg());
- //Src1 always has to be VGPR in VOP2
+ // Src1 always has to be VGPR in VOP2
if (!Src0IsVGPR && Src1IsVGPR)
InstsToSwap--;
else if (Src0IsVGPR && !Src1IsVGPR)
Force-pushed from 5846d9e to 6c6a477
ping
if (!Src1Imm && Src0Imm)
  InstsToSwap--;
else if (Src1Imm && !Src0Imm &&
         UseInst->getOperand(1).getImm() == SISrcMods::NONE &&
         TRI->isVGPR(*MRI, Src0.getReg()))
  InstsToSwap++;
This is not accurate when src0 is immediate and src1 is either sgpr or vgpr with source modifiers.
Also it is not accurate when either of the src operands is an sgpr and the other one is a vgpr without source modifiers.
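A rough sketch of the more complete check being asked for here, close to the shape the patch later adopts in the clang-format diff above (names like Src0, Src1, InstsToSwap, UseInst come from the patch; the exact weighting is an assumption):

// Skip uses that can never shrink to VOP2: either operand has source modifiers.
if (UseInst.getOperand(1).getImm() != SISrcMods::NONE ||
    UseInst.getOperand(3).getImm() != SISrcMods::NONE)
  continue;

bool Src0IsVGPR = Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg());
bool Src1IsVGPR = Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg());
// VOP2 needs a VGPR in src1, so count a use as profitable only when the swap
// would move a VGPR into src1; count it against the swap when src1 is already a VGPR.
if (!Src0IsVGPR && Src1IsVGPR)
  InstsToSwap--;
else if (Src0IsVGPR && !Src1IsVGPR)
  InstsToSwap++;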
unsigned OpNum = MI.getNumExplicitOperands();
for (unsigned Idx = 1; Idx < OpNum; Idx++) {
  MachineOperand Op = MI.getOperand(Idx);
Unnecessary copy of operand.
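A minimal sketch of the requested change, reusing the loop from the patch (only the binding differs):

for (unsigned Idx = 1; Idx < OpNum; Idx++) {
  MachineOperand &Op = MI.getOperand(Idx); // bind a reference instead of copying the operand
  InverseCompare.add(Op);
}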
bool SIShrinkInstructions::shouldSwapCndOperands(
    MachineInstr &MI, SmallVector<MachineOperand *, 4> &UsesToProcess) const {
  auto AllUses = MRI->use_nodbg_operands(MI.getOperand(0).getReg());
This can be use_nodbg_instructions and UsesToProcess can track instructions instead.
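A hedged sketch of that suggestion (assuming Reg is the compare's result register and UsesToProcess becomes a vector of MachineInstr pointers):

for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
  // The swap is only considered when every user is a v_cndmask; bail out otherwise.
  if (UseMI.getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
    return false;
  UsesToProcess.push_back(&UseMI);
}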
}

bool SIShrinkInstructions::shouldSwapCndOperands(
    MachineInstr &MI, SmallVector<MachineOperand *, 4> &UsesToProcess) const {
This only uses dst reg of MI. You can just pass that as an argument.
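One possible shape of that signature (a sketch only; the parameter name is assumed):

bool shouldSwapCndOperands(Register Reg,
                           SmallVector<MachineOperand *, 4> &UsesToProcess) const;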
Force-pushed from a5aaadf to 25bf58b
auto AllUses = MRI->use_nodbg_instructions(Reg);
int InstsToSwap = 0;

for (auto &UseInst : AllUses) {
Should avoid looking at all uses, should perform the fold from the use cndmask
The decision to swap operands is made either for all instructions or for none, since the original cmp instruction that defines vcc is also changed. You cannot really avoid looking at all uses.
Performing the fold from the use cndmask is inconvenient because you would have to track whether the specific vcc value was already analyzed.
if (Op.isReg() && Op.isKill())
  InverseCompare->getOperand(Idx).setIsKill(false);
Can just do a clearKillFlags
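A minimal sketch of that suggestion, using MachineRegisterInfo::clearKillFlags in place of the per-operand fix above (the surrounding loop is the one from the patch):

if (Op.isReg())
  MRI->clearKillFlags(Op.getReg()); // drops kill flags on every use of this register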
InverseCompare->setFlags(MI.getFlags());

unsigned OpNum = MI.getNumExplicitOperands();
for (unsigned Idx = 1; Idx < OpNum; Idx++) {
The loop just makes this more complex, just do a complete buildMI above (or just mutate the instruction in place?)
The loop just makes this more complex, just do a complete buildMI above (or just mutate the instruction in place?)
The number of operands for cmp instructions is not fixed.
The loop just makes this more complex, just do a complete buildMI above (or just mutate the instruction in place?)
I can do that, but then I have to check the number of operands explicitly.
It is fixed: it either has source modifiers or it does not (I guess cmp_class complicates it slightly).
  swapCndOperands(*Use);
}

MRI->replaceRegWith(Reg, NewVCC);
Should probably be making a local replacement
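A hedged sketch of what a local replacement could look like, assuming the cndmask users were already collected in UsesToProcess (instead of the global replaceRegWith above):

for (MachineInstr *UseMI : UsesToProcess)
  UseMI->substituteRegister(Reg, NewVCC, /*SubIdx=*/0, *TRI);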
…f creating new instruction
It would probably be easier to handle this as a pre-selection combine
InverseCompare->setFlags(MI.getFlags());

unsigned OpNum = MI.getNumExplicitOperands();
for (unsigned Idx = 1; Idx < OpNum; Idx++) {
It is fixed: it either has source modifiers or it does not (I guess cmp_class complicates it slightly).
  return AMDGPU::V_CMP_O_F64_e64;
default:
  return 0;
}
Could also handle cmp_class, but we probably should have handled this earlier
I need to check source modifiers to determine whether the swap is beneficial; I can't check that in a pre-selection combine.
Not directly, no. But you can detect with high confidence whether the source modifier patterns appear (in FP or integer form); it's just slightly inconvenient. However, we can just unconditionally prefer to place the constants in src0 in the compare + select pattern, so it kind of doesn't matter.
static void swapCndOperands(MachineInstr &MI) {
  MachineOperand &Op2 = MI.getOperand(2);
  MachineOperand Op4 = MI.getOperand(4);
Can we have an assertion to ensure MI contains at least 5 operands before getOperand(4)?
No, that would just clutter the code. getOperand already has the assertion in its implementation.
  continue;

bool Src0IsVGPR = Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg());
bool Src1IsVGPR = Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg());
nit: IsSrc0VGPR and IsSrc1VGPR seems better.
Swap the operands in v_cndmask x, y, where y is a constant, to use the VOP2 format instead of VOP3. This also requires inverting the comparison (the instruction that generates the vcc consumed by v_cndmask). Doing so allows these instructions to later be merged into v_dual_cndmask.
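For instance, in the GFX11 saddsat test updated by this patch, the shrunken select now dual-issues as v_dual_cndmask_b32 v1, 0, v2 :: v_dual_mov_b32 v2, s5, where the previous VOP3 form v_cndmask_b32_e64 v1, v2, 0, s0 could not be paired.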