[X86] Distribute Certain Bitwise Operations over SELECT #136555
Conversation
Stacked on top of #136554.
@llvm/pr-subscribers-backend-x86

Author: Marius Kamp (mskamp)

Changes

InstCombine canonicalizes `(select P (and X (- X)) X)` to `(and (select P (- X) umax) X)`. This is counterproductive for the X86 backend when BMI is available because we can encode `(and X (- X))` using the `BLSI` instruction. A similar situation arises if we have `(select P (and X (sub X 1)) X)` (prevents use of the `BLSR` instruction) or `(select P (xor X (sub X 1)) X)` (prevents use of the `BLSMSK` instruction).

Trigger the inverse transformation in the X86 backend if BMI is available and we can use the mentioned BMI instructions. This is done by adjusting the `shouldFoldSelectWithIdentityConstant()` implementation for the X86 backend. In this way, we get `(select P (and X (- X)) X)` again, which enables the use of `BLSI` (similar for the other cases described above).

Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi

Fixes #131587, fixes #133848.

Patch is 24.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/136555.diff

2 Files Affected: llvm/lib/Target/X86/X86ISelLowering.cpp (modified), llvm/test/CodeGen/X86/bmi-select-distrib.ll (added)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6b71f49165c60..e7dcf4a91e8fc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,6 +28,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -35552,8 +35553,24 @@ bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(
unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
SDValue Y) const {
- if (SelectOpcode != ISD::VSELECT)
+ if (SelectOpcode == ISD::SELECT) {
+ if (VT.isVector())
+ return false;
+ if (!Subtarget.hasBMI() || (VT != MVT::i32 && VT != MVT::i64))
+ return false;
+ using namespace llvm::SDPatternMatch;
+ // BLSI
+ if (BinOpcode == ISD::AND && sd_match(Y, m_Neg(m_Specific(X))))
+ return true;
+ // BLSR
+ if (BinOpcode == ISD::AND && sd_match(Y, m_Add(m_Specific(X), m_AllOnes())))
+ return true;
+ // BLSMSK
+ if (BinOpcode == ISD::XOR && sd_match(Y, m_Add(m_Specific(X), m_AllOnes())))
+ return true;
+
return false;
+ }
// TODO: This is too general. There are cases where pre-AVX512 codegen would
// benefit. The transform may also be profitable for scalar code.
if (!Subtarget.hasAVX512())
diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
new file mode 100644
index 0000000000000..466f877f57600
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -0,0 +1,778 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64
+
+define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi1:
+; X86: # %bb.0:
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi1:
+; X64: # %bb.0:
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi2:
+; X86: # %bb.0:
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi2:
+; X64: # %bb.0:
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi3:
+; X86: # %bb.0:
+; X86-NEXT: blsil %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi3:
+; X64: # %bb.0:
+; X64-NEXT: blsil %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 -1, i32 %sub
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_neg_to_blsi_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: sbbl %esi, %edx
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_to_blsi_i64:
+; X64: # %bb.0:
+; X64-NEXT: blsiq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %sub = sub i64 0, %a1
+ %bls = select i1 %a0, i64 %sub, i64 -1
+ %ret = and i64 %a1, %bls
+ ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_neg_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_neg_i16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: negl %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_i16:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %sub = sub i16 0, %a1
+ %bls = select i1 %a0, i16 %sub, i16 -1
+ %ret = and i16 %a1, %bls
+ ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_neg_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_neg_v4xi32:
+; X86: # %bb.0:
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: jne .LBB5_1
+; X86-NEXT: # %bb.2:
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+; X86-NEXT: .LBB5_1:
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: psubd %xmm0, %xmm1
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_v4xi32:
+; X64: # %bb.0:
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: jne .LBB5_1
+; X64-NEXT: # %bb.2:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+; X64-NEXT: .LBB5_1:
+; X64-NEXT: pxor %xmm1, %xmm1
+; X64-NEXT: psubd %xmm0, %xmm1
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %sub = sub <4 x i32> zeroinitializer, %a1
+ %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ %ret = and <4 x i32> %a1, %bls
+ ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_neg(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_neg:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_no_neg:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %esi, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 0
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_neg_wrong_const:
+; X86: # %bb.0:
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: negl %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_wrong_const:
+; X64: # %bb.0:
+; X64-NEXT: movl %esi, %ecx
+; X64-NEXT: negl %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a1
+ %bls = select i1 %a0, i32 %sub, i32 1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_neg_different_op:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: negl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_neg_different_op:
+; X64: # %bb.0:
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: negl %edx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %edx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 0, %a2
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr1:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr1:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr2:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr2:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr3:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr3:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 -1, i32 %sub
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr4:
+; X86: # %bb.0:
+; X86-NEXT: blsrl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr4:
+; X64: # %bb.0:
+; X64-NEXT: blsrl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_to_blsr_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_to_blsr_i64:
+; X64: # %bb.0:
+; X64-NEXT: blsrq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %sub = add i64 %a1, -1
+ %bls = select i1 %a0, i64 %sub, i64 -1
+ %ret = and i64 %a1, %bls
+ ret i64 %ret
+}
+
+; Negative test
+define i16 @and_select_sub_1_i16(i1 %a0, i16 %a1) nounwind {
+; X86-LABEL: and_select_sub_1_i16:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: leal -1(%edx), %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %eax, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_i16:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: retq
+ %sub = add i16 %a1, -1
+ %bls = select i1 %a0, i16 %sub, i16 -1
+ %ret = and i16 %a1, %bls
+ ret i16 %ret
+}
+
+; Negative test
+define <4 x i32> @and_select_sub_1_v4xi32(i1 %a0, <4 x i32> %a1) nounwind {
+; X86-LABEL: and_select_sub_1_v4xi32:
+; X86: # %bb.0:
+; X86-NEXT: pcmpeqd %xmm1, %xmm1
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: je .LBB15_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: paddd %xmm0, %xmm1
+; X86-NEXT: .LBB15_2:
+; X86-NEXT: pand %xmm1, %xmm0
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_v4xi32:
+; X64: # %bb.0:
+; X64-NEXT: pcmpeqd %xmm1, %xmm1
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: je .LBB15_2
+; X64-NEXT: # %bb.1:
+; X64-NEXT: paddd %xmm0, %xmm1
+; X64-NEXT: .LBB15_2:
+; X64-NEXT: pand %xmm1, %xmm0
+; X64-NEXT: retq
+ %sub = add <4 x i32> %a1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %bls = select i1 %a0, <4 x i32> %sub, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+ %ret = and <4 x i32> %a1, %bls
+ ret <4 x i32> %ret
+}
+
+; Negative test
+define i32 @and_select_no_sub_1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_no_sub_1:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: leal -2(%eax), %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_no_sub_1:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -2(%rsi), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -2
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_wrong_const(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: and_select_sub_1_wrong_const:
+; X86: # %bb.0:
+; X86-NEXT: leal -1(%eax), %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: cmovnel %ecx, %edx
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_wrong_const:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $esi killed $esi def $rsi
+; X64-NEXT: leal -1(%rsi), %ecx
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: movl $1, %eax
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+; Negative test
+define i32 @and_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) nounwind {
+; X86-LABEL: and_select_sub_1_different_op:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andb $1, %cl
+; X86-NEXT: decl %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: cmpb $1, %cl
+; X86-NEXT: sbbl %esi, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: and_select_sub_1_different_op:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $edx killed $edx def $rdx
+; X64-NEXT: andb $1, %dil
+; X64-NEXT: leal -1(%rdx), %ecx
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: cmpb $1, %dil
+; X64-NEXT: sbbl %eax, %eax
+; X64-NEXT: orl %ecx, %eax
+; X64-NEXT: andl %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a2, -1
+ %bls = select i1 %a0, i32 %sub, i32 -1
+ %ret = and i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk1(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk1:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk1:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk2(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk2:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk2:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %bls, %a1
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk3(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk3:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk3:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovnel %esi, %eax
+; X64-NEXT: retq
+ %sub = add i32 %a1, -1
+ %bls = select i1 %a0, i32 0, i32 %sub
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i32 @xor_select_sub_1_to_blsmsk4(i1 %a0, i32 inreg %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk4:
+; X86: # %bb.0:
+; X86-NEXT: blsmskl %eax, %ecx
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk4:
+; X64: # %bb.0:
+; X64-NEXT: blsmskl %esi, %eax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmovel %esi, %eax
+; X64-NEXT: retq
+ %sub = sub i32 %a1, 1
+ %bls = select i1 %a0, i32 %sub, i32 0
+ %ret = xor i32 %a1, %bls
+ ret i32 %ret
+}
+
+define i64 @xor_select_sub_1_to_blsmsk_i64(i1 %a0, i64 %a1) nounwind {
+; X86-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X86: # %bb.0:
+; X86-NEXT: pushl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: addl $-1, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: xorl %ecx, %eax
+; X86-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovel %ecx, %eax
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: popl %esi
+; X86-NEXT: retl
+;
+; X64-LABEL: xor_select_sub_1_to_blsmsk_i64:
+; X64: # %bb.0:
+; X64-NEXT: blsmskq %rsi, %rax
+; X64-NEXT: testb $1, %dil
+; X64-NEXT: cmoveq %rsi, %rax
+; X64-NEXT: retq
+ %sub = add i64 %a1, -1
+ %bls = select i1 %a0, i64 %sub,...
[truncated]
    return false;
    using namespace llvm::SDPatternMatch;
    // BLSI
    if (BinOpcode == ISD::AND && sd_match(Y, m_Neg(m_Specific(X))))
Can't `sd_match(X, m_Neg(m_Specific(Y)))` occur as well?
Yes, and I've added this condition now. In this case, however, we still do not emit a BLSI instruction because the negation is used more than once after the transformation (in the other operand of the select). Codegen seems to improve slightly, though. I'm not sure if it's worth it in general. So I'm quite indifferent whether we keep this additional condition or remove it again.
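For reference, a rough standalone restatement of the check being discussed, including the commuted direction; this is only a sketch of the idea, not the exact code in the patch (the helper name and wrapper function are made up for illustration; the real logic lives in X86TargetLowering::shouldFoldSelectWithIdentityConstant()):

// Hypothetical standalone sketch of the BLSI check, with the commuted match.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SDPatternMatch.h"

static bool isBlsiSelectCandidate(unsigned BinOpcode, llvm::SDValue X,
                                  llvm::SDValue Y) {
  using namespace llvm::SDPatternMatch;
  if (BinOpcode != llvm::ISD::AND)
    return false;
  // Original direction: the select arm Y is the negation of X.
  if (sd_match(Y, m_Neg(m_Specific(X))))
    return true;
  // Commuted direction raised in the review: X is the negation of the select
  // arm Y. As noted above, BLSI is still not emitted in this case because the
  // negation gains a second use, but codegen reportedly improves slightly.
  return sd_match(X, m_Neg(m_Specific(Y)));
}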
LGTM with one minor
InstCombine canonicalizes `(select P (and X (- X)) X)` to `(and (select P (- X) umax) X)`. This is counterproductive for the X86 backend when BMI is available because we can encode `(and X (- X))` using the `BLSI` instruction. A similar situation arises if we have `(select P (and X (sub X 1)) X)` (prevents use of `BLSR` instruction) or `(select P (xor X (sub X 1)) X)` (prevents use of `BLSMSK` instruction). Trigger the inverse transformation in the X86 backend if BMI is available and we can use the mentioned BMI instructions. This is done by adjusting the `shouldFoldSelectWithIdentityConstant()` implementation for the X86 backend. In this way, we get `(select P (and X (- X)) X)` again, which enables the use of `BLSI` (similar for the other cases described above). Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi Fixes llvm#131587, fixes llvm#133848.
InstCombine canonicalizes `(select P (and X (- X)) X)` to `(and (select P (- X) umax) X)`. This is counterproductive for the X86 backend when BMI is available because we can encode `(and X (- X))` using the `BLSI` instruction. A similar situation arises if we have `(select P (and X (sub X 1)) X)` (prevents use of the `BLSR` instruction) or `(select P (xor X (sub X 1)) X)` (prevents use of the `BLSMSK` instruction).

Trigger the inverse transformation in the X86 backend if BMI is available and we can use the mentioned BMI instructions. This is done by overriding the appropriate `shouldFoldSelectWithIdentityConstant()` overload. In this way, we get `(select P (and X (- X)) X)` again, which enables the use of `BLSI` (similar for the other cases described above).

Alive proofs: https://alive2.llvm.org/ce/z/MT_pKi

Fixes #131587, fixes #133848.
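As a rough source-level illustration of the patterns this affects (a minimal sketch with made-up function names, assuming an x86-64 target with BMI; with this change the ternaries below should lower to BLSI/BLSR/BLSMSK feeding a CMOV, matching the added tests):

// Minimal sketch (hypothetical names) of the three BMI-friendly patterns.
#include <cstdint>

uint32_t blsi_like(bool p, uint32_t x) {
  // InstCombine rewrites `p ? (x & -x) : x` into `x & (p ? -x : ~0u)`;
  // with BMI the backend now prefers the first form and can emit BLSI.
  return x & (p ? -x : ~0u);
}

uint32_t blsr_like(bool p, uint32_t x) {
  // Identity constant for AND is all-ones; BLSR clears the lowest set bit.
  return x & (p ? (x - 1) : ~0u);
}

uint32_t blsmsk_like(bool p, uint32_t x) {
  // Identity constant for XOR is zero; BLSMSK masks up to the lowest set bit.
  return x ^ (p ? (x - 1) : 0u);
}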