[AMDGPU] Merge two V_CNDMASK instructions into V_DUAL_CNDMASK #135007
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu

Author: Ana Mihajlovic (mihajlovicana)

Changes: Switch the operands in v_cndmask x, y, where y is a constant, so that the VOP2 format can be used instead of VOP3 (this also requires inverting the comparison). Doing so allows these instructions to later be merged into v_dual_cndmask.

Patch is 44.14 KiB, truncated to 20.00 KiB below (11 files affected); full version: https://github.com/llvm/llvm-project/pull/135007.diff
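As a concrete illustration, here is the shape of the change as it appears in the GFX9 GlobalISel test updates below (div_i128.ll): the compare feeding vcc is inverted and the cndmask operands are swapped so that the constant ends up in src0, which is what the VOP2 (_e32) encoding requires.

```
; before: constant in src1 forces the 64-bit VOP3 (_e64) encoding
v_cmp_ne_u32_e32 vcc, 0, v6
v_cndmask_b32_e64 v6, v8, 0, vcc
v_cndmask_b32_e64 v7, v9, 0, vcc

; after: inverted compare, constant moved to src0, 32-bit VOP2 (_e32) encoding
v_cmp_eq_u32_e32 vcc, 0, v6
v_cndmask_b32_e32 v6, 0, v8, vcc
v_cndmask_b32_e32 v7, 0, v9, vcc
```

On targets with VOPD (e.g. GFX11), adjacent _e32 cndmasks reading vcc like these can later be merged into v_dual_cndmask_b32, which is the goal of this patch.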
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d6acf9e081b9f..4ad538e0b1e5f 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -105,6 +105,25 @@ class SIFoldOperandsImpl {
}
}
+ unsigned getInverseCompareOpcode(MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_CMP_EQ_U32_e64:
+ return AMDGPU::V_CMP_NE_U32_e64;
+ case AMDGPU::V_CMP_NE_U32_e64:
+ return AMDGPU::V_CMP_EQ_U32_e64;
+ case AMDGPU::V_CMP_GE_U32_e64:
+ return AMDGPU::V_CMP_LT_U32_e64;
+ case AMDGPU::V_CMP_LE_U32_e64:
+ return AMDGPU::V_CMP_GT_U32_e64;
+ case AMDGPU::V_CMP_GT_U32_e64:
+ return AMDGPU::V_CMP_LE_U32_e64;
+ case AMDGPU::V_CMP_LT_U32_e64:
+ return AMDGPU::V_CMP_GE_U32_e64;
+ default:
+ return 0;
+ }
+ }
+
bool foldCopyToVGPROfScalarAddOfFrameIndex(Register DstReg, Register SrcReg,
MachineInstr &MI) const;
@@ -133,7 +152,8 @@ class SIFoldOperandsImpl {
std::optional<int64_t> getImmOrMaterializedImm(MachineOperand &Op) const;
bool tryConstantFoldOp(MachineInstr *MI) const;
- bool tryFoldCndMask(MachineInstr &MI) const;
+ bool tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
+ Register *newVCC) const;
bool tryFoldZeroHighBits(MachineInstr &MI) const;
bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
@@ -152,6 +172,9 @@ class SIFoldOperandsImpl {
bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
+ bool shouldSwitchOperands(MachineRegisterInfo &MRI, MachineInstr &MI,
+ const SIInstrInfo &TII) const;
+
public:
SIFoldOperandsImpl() = default;
@@ -1459,13 +1482,73 @@ bool SIFoldOperandsImpl::tryConstantFoldOp(MachineInstr *MI) const {
return false;
}
+bool SIFoldOperandsImpl::shouldSwitchOperands(MachineRegisterInfo &MRI,
+ MachineInstr &MI,
+ const SIInstrInfo &TII) const {
+ auto allUses = MRI.use_nodbg_operands(MI.getOperand(5).getReg());
+ unsigned count = 0;
+
+ for (auto &Use : allUses) {
+ if (Use.getParent()->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
+ return false;
+ MachineOperand *Src0 =
+ TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src0);
+ MachineOperand *Src1 =
+ TII.getNamedOperand(*Use.getParent(), AMDGPU::OpName::src1);
+
+ auto src0Imm = getImmOrMaterializedImm(*Src0);
+ auto src1Imm = getImmOrMaterializedImm(*Src1);
+
+ if (!src1Imm && src0Imm)
+ return false;
+ if (src1Imm && !src0Imm)
+ count++;
+ }
+ return (count >= 2);
+}
+
// Try to fold an instruction into a simpler one
-bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
+bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
+ Register *NewVCC) const {
unsigned Opc = MI.getOpcode();
if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
return false;
+ if (Opc == AMDGPU::V_CNDMASK_B32_e64) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ auto Reg = MI.getOperand(5).getReg();
+
+ if (*RegVCC != Reg) {
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (DefMI) {
+ unsigned Opcode = getInverseCompareOpcode(*DefMI);
+ if (Opcode &&
+ SIFoldOperandsImpl::shouldSwitchOperands(*MRI, MI, *TII)) {
+ auto cmpDL = DefMI->getDebugLoc();
+ *NewVCC = MRI->createVirtualRegister(MRI->getRegClass(Reg));
+ *RegVCC = Reg;
+ MachineInstrBuilder inverseCompare = BuildMI(
+ *DefMI->getParent(), DefMI, cmpDL, TII->get(Opcode), *NewVCC);
+
+ inverseCompare.add(DefMI->getOperand(1));
+ inverseCompare.add(DefMI->getOperand(2));
+ }
+ }
+ }
+ if (*RegVCC == Reg) {
+ BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64),
+ MI.getOperand(0).getReg())
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .addReg(*NewVCC);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (!Src1->isIdenticalTo(*Src0)) {
@@ -2533,10 +2616,12 @@ bool SIFoldOperandsImpl::run(MachineFunction &MF) {
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
bool Changed = false;
+ Register Reg = 0;
+ Register newVCC = 0;
for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineOperand *CurrentKnownM0Val = nullptr;
for (auto &MI : make_early_inc_range(*MBB)) {
- Changed |= tryFoldCndMask(MI);
+ Changed |= tryFoldCndMask(MI, &Reg, &newVCC);
if (tryFoldZeroHighBits(MI)) {
Changed = true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index d9158e3558395..536504747c971 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -2835,9 +2835,9 @@ define i48 @v_uaddsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_i48:
@@ -2944,10 +2944,10 @@ define amdgpu_ps <2 x float> @uaddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: ; return to shader part epilog
;
@@ -3003,10 +3003,10 @@ define amdgpu_ps <2 x float> @uaddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT: ; return to shader part epilog
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index 1fd139b06417f..1944d1577ae29 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -2705,9 +2705,9 @@ define i48 @v_usubsat_i48(i48 %lhs, i48 %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_i48:
@@ -2815,9 +2815,9 @@ define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i48_sv:
@@ -2873,9 +2873,9 @@ define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: usubsat_i48_vs:
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 06c0417211809..efd633d21dba1 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1287,11 +1287,11 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0
; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2
; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20
-; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc
+; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v8, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v9, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v12, 0, v10, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v13, 0, v11, vcc
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14
@@ -3414,11 +3414,11 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12
; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14
; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18
-; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
-; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
+; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX9-G-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v8, 0, v2, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v9, 0, v3, vcc
; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 77b78f1f8a333..07d7276e3b944 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -495,13 +495,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
; GISEL-NEXT: v_and_b32_e32 v9, 1, v9
; GISEL-NEXT: v_and_b32_e32 v8, 1, v8
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v22, v18, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v18, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v20, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v21, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v21, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v23, v19, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v19, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB0_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -685,12 +685,12 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v11, v14, v15
; GISEL-NEXT: v_and_b32_e32 v14, 1, v11
; GISEL-NEXT: v_or_b32_e32 v10, v11, v10
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v14
+; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v6, vcc
; GISEL-NEXT: v_and_b32_e32 v16, 1, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v13, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -1251,13 +1251,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
; GISEL-NEXT: v_and_b32_e32 v3, 1, v3
; GISEL-NEXT: v_and_b32_e32 v2, 1, v2
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v1, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB1_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -1423,12 +1423,12 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v9, v20, v10
; GISEL-NEXT: v_and_b32_e32 v10, 1, v9
; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v4, vcc
; GISEL-NEXT: v_and_b32_e32 v20, 1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2093,13 +2093,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
; GISEL-NEXT: v_and_b32_e32 v19, 1, v19
; GISEL-NEXT: v_and_b32_e32 v18, 1, v18
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v31, 0, v16, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v9, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v32, 0, v17, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB2_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -2283,12 +2283,12 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v3, v20, v21
; GISEL-NEXT: v_and_b32_e32 v20, 1, v3
; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v12, vcc
; GISEL-NEXT: v_and_b32_e32 v22, 1, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
@@ -2920,13 +2920,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v20, v21, v20
; GISEL-NEXT: v_and_b32_e32 v21, 1, v21
; GISEL-NEXT: v_and_b32_e32 v20, 1, v20
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v21
-; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v21
+; GISEL-NEXT: v_cndmask_b32_e32 v32, 0, v0, vcc
; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v20
-; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v3, vcc
; GISEL-NEXT: s_xor_b64 s[4:5], s[4:5], -1
-; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v33, 0, v1, vcc
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
; GISEL-NEXT: s_cbranch_execz .LBB3_6
; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
@@ -3092,12 +3092,12 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GISEL-NEXT: v_or_b32_e32 v19, v26, v24
; GISEL-NEXT: v_and_b32_e32 v24, 1, v19
; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
-; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v4, vcc
; GISEL-NEXT: v_and_b32_e32 v26, 1, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v18, 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, 0, v7, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index 10de973dac0c5..cd1426f868bce 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1282,10 +1282,10 @@ define double @double16_extelt_vec(i32 %sel) {
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GCN-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GCN-NEXT: v_mov_b32_e32 v1, 0x40301999
-; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
entry:
%ext = extractelement <16 x double> <double 1.1, double 2.1, double 3.1, double 4.1, double 5.1, double 6.1, double 7.1, double 8.1, double 9.1, double 10.1, double 11.1, double 12.1, double 13.1, double 14.1, double 15.1, double 16.1>, i32 %sel
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 14f7cbcd0f438..1b471166b5d29 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -2836,9 +2836,9 @@ define float @v_fneg_select_infloop_regression_f32(float %a...
[truncated]
Normally this kind of shrinking would be done in SIShrinkInstructions. Is there a reason you can't do it there?
bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI, Register *RegVCC,
                                        Register *NewVCC) const {
I don't understand how the RegVCC and NewVCC arguments are used. Please add a comment to explain.
They are used to remember the vcc register and the new inverted vcc so that the shouldSwitchOperands check is called only once.
bool SIFoldOperandsImpl::shouldSwitchOperands(MachineRegisterInfo &MRI,
                                              MachineInstr &MI,
                                              const SIInstrInfo &TII) const {
  auto allUses = MRI.use_nodbg_operands(MI.getOperand(5).getReg());
All variable names should start with an upper case letter.
    if (src1Imm && !src0Imm)
      count++;
  }
  return (count >= 2);
It is probably still worth doing this if count == 1, to reduce code size.
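Even a single use shrinks the encoding: the _e64 form is a 64-bit VOP3 instruction while the _e32 form is 32 bits, so (illustrative, reusing the pattern from the tests) each converted select saves four bytes once the compare feeding vcc is inverted:

```
; 8-byte VOP3 encoding (constant in src1)
v_cndmask_b32_e64 v0, v2, 0, vcc
; 4-byte VOP2 encoding of the same select (constant in src0, compare inverted)
v_cndmask_b32_e32 v0, 0, v2, vcc
```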
    return AMDGPU::V_CMP_LE_U32_e64;
  case AMDGPU::V_CMP_LT_U32_e64:
    return AMDGPU::V_CMP_GE_U32_e64;
  default:
Should also handle floating point comparisons?
yes
I wasn't sure if that would be proper, because I am changing not only v_cndmask but also the cmp instructions, but correct me if I'm wrong.
    if (src1Imm && !src0Imm)
      count++;
If src0 has source modifiers then swapping operands will not help with turning this instruction into vop2.
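For instance (an illustrative instruction, not taken from the patch's tests), a neg modifier on src0 already forces the 64-bit VOP3 encoding, and VOP2 has no source-modifier bits, so moving the constant into src0 would not enable the _e32 form:

```
; stays VOP3: the neg modifier cannot be encoded in VOP2,
; regardless of which operand holds the constant
v_cndmask_b32_e64 v0, -v1, 0, vcc
```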
Should that be added to the check?
yes
I think it is OK to do it in SIShrinkInstructions. It does not have stricter rules than SIFoldOperands.
Okay, will try to migrate that there.
I submitted this as a separate PR because I found it cleaner: #135162
It kind of does; it needs to support SSA and non-SSA.
Moved to #135162.