[AMDGPU] Merge two V_CNDMASK instructions into V_DUAL_CNDMASK #135007
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Ana Mihajlovic (mihajlovicana)

Changes: Switch the operands in v_cndmask x, y, where y is a constant, so that the VOP2 format can be used instead of VOP3 (this also requires inverting the comparison). Doing so allows these instructions to later be merged into v_dual_cndmask.

Patch is 44.14 KiB, truncated to 20.00 KiB below (11 files affected); full version: https://github.com/llvm/llvm-project/pull/135007.diff
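As a concrete illustration, here is the shape of the change as it appears in the GFX9 GlobalISel test updates below (div_i128.ll): the compare feeding vcc is inverted and the cndmask operands are swapped so that the constant ends up in src0, which is what the VOP2 (_e32) encoding requires.

```
; before: constant in src1 forces the 64-bit VOP3 (_e64) encoding
v_cmp_ne_u32_e32 vcc, 0, v6
v_cndmask_b32_e64 v6, v8, 0, vcc
v_cndmask_b32_e64 v7, v9, 0, vcc

; after: inverted compare, constant moved to src0, 32-bit VOP2 (_e32) encoding
v_cmp_eq_u32_e32 vcc, 0, v6
v_cndmask_b32_e32 v6, 0, v8, vcc
v_cndmask_b32_e32 v7, 0, v9, vcc
```

On targets with VOPD (e.g. GFX11), adjacent _e32 cndmasks reading vcc like these can later be merged into v_dual_cndmask_b32, which is the goal of this patch.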
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d6acf9e081b9f..4ad538e0b1e5f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -105,6 +105,25 @@ class SIFoldOperandsImpl {
}
}
+ unsigned getInverseCompareOpcode(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_CMP_EQ_U32_e64:
+ return AMDGPU::V_CMP_NE_U32_e64;
+ case AMDGPU::V_CMP_NE_U32_e64:
+ return AMDGPU::V_CMP_EQ_U32_e64;
+ case AMDGPU::V_CMP_GE_U32_e64:
+ return AMDGPU::V_CMP_LT_U32_e64;
+ case AMDGPU::V_CMP_LE_U32_e64:
+ return AMDGPU::V_CMP_GT_U32_e64;
+ case AMDGPU::V_CMP_GT_U32_e64:
+ return AMDGPU::V_CMP_LE_U32_e64;
+ case AMDGPU::V_CMP_LT_U32_e64:
+ return AMDGPU::V_CMP_GE_U32_e64;
+ default:
+ return 0;
+ }
+ }
+
bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
MachineInstr &MI) const;
@@ -133,7 +152,8 @@ class SIFoldOperandsImpl {
std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
bool tryConstantFoldOp(MachineInstr *MI) const;
- bool tryFoldCndMask(MachineInstr &MI) const;
+ bool tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
+ Register *newVCC) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
@@ -152,6 +172,9 @@ class SIFoldOperandsImpl {
bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
+ bool shouldSwitchOperands(MachineRegisterInfo &MRI, MachineInstr &MI,
+ const SIInstrInfo &TII) const;
+
public:
SIFoldOperandsImpl() = default;
@@ -1459,13 +1482,73 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
return false;
}
+bool SIFoldOperandsImpl::shouldSwitchOperands(MachineRegisterInfo &MRI,
+ MachineInstr &MI,
+ const SIInstrInfo &TII) const {
+ auto allUses = MRI.use_nodbg_operands(MI.getOperand(5).getReg());
+ unsigned count = 0;
+
+ for (auto &Use : allUses) {
+ if (Use.getParent()->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+ return false;
+ MachineOperand *Src0 =
+ TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src0);
+ MachineOperand *Src1 =
+ TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src1);
+
+ auto src0Imm = getImmOrMaterializedImm(*Src0);
+ auto src1Imm = getImmOrMaterializedImm(*Src1);
+
+ if (!src1Imm && src0Imm)
+ return false;
+ if (src1Imm && !src0Imm)
+ count++;
+ }
+ return (count >= 2);
+}
+
// Try to fold an instruction into a simpler one
-bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
+bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
+ Register *NewVCC) const {
unsigned Opc = MI.getOpcode();
if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
return false;
+ if (Opc == AMDGPU::V_CNDMASK_B32_e64) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Reg = MI.getOperand(5).getReg();
+
+ if (*RegVCC != Reg) {
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (DefMI) {
+ unsigned Opcode = getInverseCompareOpcode(*DefMI);
+ if (Opcode &&
+ SIFoldOperandsImpl::shouldSwitchOperands(*MRI, MI, *TII)) {
+ auto cmpDL = DefMI->getDebugLoc();
+ *NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+ *RegVCC = Reg;
+ MachineInstrBuilder inverseCompare = BuildMI(
+ *DefMI->getParent(), DefMI, cmpDL, TII->get(Opcode), *NewVCC);
+
+ inverseCompare.add(DefMI->getOperand(1));
+ inverseCompare.add(DefMI->getOperand(2));
+ }
+ }
+ }
+ if (*RegVCC == Reg) {
+ BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addReg(*NewVCC);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src1->isIdenticalTo(*Src0)) {
@@ -2533,10 +2616,12 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
bool Changed = false;
+ Register Reg = 0;
+ Register newVCC = 0;
for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineOperand *CurrentKnownM0Val = nullptr;
for (auto &MI : make_early_inc_range(*MBB)) {
- Changed |= tryFoldCndMask(MI);
+ Changed |= tryFoldCndMask(MI, &Reg, &newVCC);
if (tryFoldZeroHighBits(MI)) {
Changed = true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index d9158e3558395..536504747c971 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2835,9 +2835,9 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_i48:
@@ -2944,10 +2944,10 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: ; return to shader part epilog
;
@@ -3003,10 +3003,10 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 1fd139b06417f..1944d1577ae29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2705,9 +2705,9 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i48:
@@ -2815,9 +2815,9 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i48_sv:
@@ -2873,9 +2873,9 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i48_vs:
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 06c0417211809..efd633d21dba1 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1287,11 +1287,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0
; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2
; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20
-; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc
+; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v12, 0, v10, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v13, 0, v11, vcc
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14
@@ -3414,11 +3414,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12
; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14
; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18
-; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
+; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX9-G-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v8, 0, v2, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v9, 0, v3, vcc
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..07d7276e3b944 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -495,13 +495,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
; GISEL-NEXT: v_and_b32_e32 v9, 1, v9
; GISEL-NEXT: v_and_b32_e32 v8, 1, v8
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v18, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v21, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v19, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB0_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -685,12 +685,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v11, v14, v15
; GISEL-NEXT: v_and_b32_e32 v14, 1, v11
; GISEL-NEXT: v_or_b32_e32 v10, v11, v10
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
+; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v6, vcc
; GISEL-NEXT: v_and_b32_e32 v16, 1, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -1251,13 +1251,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
; GISEL-NEXT: v_and_b32_e32 v3, 1, v3
; GISEL-NEXT: v_and_b32_e32 v2, 1, v2
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v1, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB1_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -1423,12 +1423,12 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v9, v20, v10
; GISEL-NEXT: v_and_b32_e32 v10, 1, v9
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
; GISEL-NEXT: v_and_b32_e32 v20, 1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2093,13 +2093,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
; GISEL-NEXT: v_and_b32_e32 v19, 1, v19
; GISEL-NEXT: v_and_b32_e32 v18, 1, v18
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v31, 0, v16, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v32, 0, v17, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB2_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -2283,12 +2283,12 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v3, v20, v21
; GISEL-NEXT: v_and_b32_e32 v20, 1, v3
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v12, vcc
; GISEL-NEXT: v_and_b32_e32 v22, 1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2920,13 +2920,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v20, v21, v20
; GISEL-NEXT: v_and_b32_e32 v21, 1, v21
; GISEL-NEXT: v_and_b32_e32 v20, 1, v20
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
+; GISEL-NEXT: v_cndmask_b32_e32 v32, 0, v0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v3, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v33, 0, v1, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB3_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -3092,12 +3092,12 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v19, v26, v24
; GISEL-NEXT: v_and_b32_e32 v24, 1, v19
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
-; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v4, vcc
; GISEL-NEXT: v_and_b32_e32 v26, 1, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 10de973dac0c5..cd1426f868bce 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1282,10 +1282,10 @@ define double @double16_extelt_vec(i32 %sel) {
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GCN-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-NEXT: v_mov_b32_e32 v1, 0x40301999
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%ext = extractelement <16 x double> <double 1.1, double 2.1, double 3.1, double 4.1, double 5.1, double 6.1, double 7.1, double 8.1, double 9.1, double 10.1, double 11.1, double 12.1, double 13.1, double 14.1, double 15.1, double 16.1>, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 14f7cbcd0f438..1b471166b5d29 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2836,9 +2836,9 @@ define float @v_fneg_select_infloop_regression_f32(float %a...
[truncated]
Normally this kind of shrinking would be done in SIShrinkInstructions. Is there a reason you can't do it there?
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
                                        Register *NewVCC) const {
I don't understand how the RegVCC and NewVCC arguments are used. Please add a comment to explain.
They are used to remember the vcc register and the new inverted vcc so that the shouldSwitchOperands check is called only once.
bool SIFoldOperandsImpl::shouldSwitchOperands(MachineRegisterInfo &MRI,
                                              MachineInstr &MI,
                                              const SIInstrInfo &TII) const {
  auto allUses = MRI.use_nodbg_operands(MI.getOperand(5).getReg());
All variable names should start with an upper case letter.
    if (src1Imm && !src0Imm)
      count++;
  }
  return (count >= 2);
It is probably still worth doing this if count == 1, to reduce code size.
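Even a single use shrinks the encoding: the _e64 form is a 64-bit VOP3 instruction while the _e32 form is 32 bits, so (illustrative, reusing the pattern from the tests) each converted select saves four bytes once the compare feeding vcc is inverted:

```
; 8-byte VOP3 encoding (constant in src1)
v_cndmask_b32_e64 v0, v2, 0, vcc
; 4-byte VOP2 encoding of the same select (constant in src0, compare inverted)
v_cndmask_b32_e32 v0, 0, v2, vcc
```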
    return AMDGPU::V_CMP_LE_U32_e64;
  case AMDGPU::V_CMP_LT_U32_e64:
    return AMDGPU::V_CMP_GE_U32_e64;
  default:
Should also handle floating point comparisons?
yes
I wasn't sure if that would be proper, because I am changing not only v_cndmask but also the cmp instructions, but correct me if I'm wrong.
    if (src1Imm && !src0Imm)
      count++;
If src0 has source modifiers then swapping operands will not help with turning this instruction into vop2.
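For instance (an illustrative instruction, not taken from the patch's tests), a neg modifier on src0 already forces the 64-bit VOP3 encoding, and VOP2 has no source-modifier bits, so moving the constant into src0 would not enable the _e32 form:

```
; stays VOP3: the neg modifier cannot be encoded in VOP2,
; regardless of which operand holds the constant
v_cndmask_b32_e64 v0, -v1, 0, vcc
```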
Should that be added to the check?
yes
I think it is OK to do it in SIShrinkInstructions. It does not have stricter rules than SIFoldOperands.
Okay, will try to migrate that there.
I submitted this as a separate PR because I found it cleaner: #135162
It kind of does; it needs to support SSA and non-SSA.
Moved to #135162.