-
Notifications
You must be signed in to change notification settings - Fork 13.5k
[X86] Convert logicalshift(x, C) -> and(x, M) iff x is allsignbits #83596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesIf we're just trying to shift the signbit/MSB down to the LSB, and the value is all-signbits, then we can just mask out the LSB (as it will already match the signbit). This helps removes some unnecessary bitcasted vXi16 shifts used for vXi8 shifts (which SimplifyDemandedBits will struggle to remove through the bitcast), and allows some AVX1 shifts of 256-bit values to stay as a YMM instruction. Noticed in codegen from #82290 Full diff: https://github.com/llvm/llvm-project/pull/83596.diff 3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0162bb65afe3b0..81a6b83a0bccd1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28932,6 +28932,7 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
@@ -28978,7 +28979,7 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
return SDValue();
// If the shift amount is out of range, return undef.
- if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
+ if (APIntShiftAmt.uge(EltSizeInBits))
return DAG.getUNDEF(VT);
uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
@@ -29006,6 +29007,12 @@ static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
Op.getOpcode() == ISD::SRA)
return ArithmeticShiftRight64(ShiftAmt);
+ // If we're right logical shifting the MSB to LSB of an all-signbits value
+ // then we can just perform as a mask.
+ if (Op.getOpcode() == ISD::SRL && ShiftAmt == (EltSizeInBits - 1))
+ if (DAG.ComputeNumSignBits(R) == EltSizeInBits)
+ return DAG.getNode(ISD::AND, dl, VT, R, DAG.getConstant(1, dl, VT));
+
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
(Subtarget.hasBWI() && VT == MVT::v64i8)) {
unsigned NumElts = VT.getVectorNumElements();
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index ee39b1333fff31..d2794df731b65d 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -180,7 +180,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
@@ -191,7 +190,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
-; SSSE3-NEXT: psrlw $7, %xmm0
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
@@ -203,7 +201,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -214,7 +211,6 @@ define <16 x i8> @ext_i16_16i8(i16 %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
@@ -268,11 +264,10 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i4_4i64:
@@ -328,11 +323,10 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i32:
@@ -390,11 +384,10 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i16:
@@ -436,14 +429,12 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: psrlw $7, %xmm0
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqb %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: psrlw $7, %xmm1
; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
; SSE2-SSSE3-NEXT: retq
;
@@ -460,13 +451,9 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i8:
@@ -477,7 +464,6 @@ define <32 x i8> @ext_i32_32i8(i32 %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
@@ -550,19 +536,18 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [16,32,64,128]
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqq %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i64:
@@ -631,19 +616,18 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [256,512,1024,2048,4096,8192,16384,32768]
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i32:
@@ -712,23 +696,22 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768]
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i32_32i16:
@@ -782,26 +765,22 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0
; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm0
-; SSE2-SSSE3-NEXT: psrlw $7, %xmm0
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm0
; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,2,3,3,4,5,6,7]
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm1
-; SSE2-SSSE3-NEXT: psrlw $7, %xmm1
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1
; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,5,5]
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2
; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm2
-; SSE2-SSSE3-NEXT: psrlw $7, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm2
; SSE2-SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,7,7]
; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; SSE2-SSSE3-NEXT: pand %xmm4, %xmm3
; SSE2-SSSE3-NEXT: pcmpeqb %xmm4, %xmm3
-; SSE2-SSSE3-NEXT: psrlw $7, %xmm3
; SSE2-SSSE3-NEXT: pand %xmm5, %xmm3
; SSE2-SSSE3-NEXT: retq
;
@@ -817,26 +796,20 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5]
; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i64_64i8:
@@ -847,13 +820,11 @@ define <64 x i8> @ext_i64_64i8(i64 %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
index 79d8e4acbba5a5..c27b5b289e1fa9 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -185,7 +185,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: psrlw $7, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
@@ -196,7 +195,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
-; SSSE3-NEXT: psrlw $7, %xmm0
; SSSE3-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
@@ -208,7 +206,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: retq
;
@@ -219,7 +216,6 @@ define <16 x i1> @bitcast_i16_16i1(i16 zeroext %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
@@ -252,13 +248,9 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: bitcast_i32_32i1:
@@ -269,7 +261,6 @@ define <32 x i1> @bitcast_i32_32i1(i32 %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
|
// If we're right logical shifting the MSB to LSB of an all-signbits value | ||
// then we can just perform as a mask. | ||
if (Op.getOpcode() == ISD::SRL && ShiftAmt == (EltSizeInBits - 1)) | ||
if (DAG.ComputeNumSignBits(R) == EltSizeInBits) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could handle any shift amount, because we are essentially just extracting a mask. How about drop the ShiftAmt
check and return:
DAG.getNode(ISD::AND, ..., DAG.getConstant(APInt::getLowBitsSet(bitwidth, bitwidth-ShiftAmount)))
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, is there a reason this isn't in DAGComber
. seems pretty general.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In most cases we'd want to avoid the mask load if we can perform an immediate shift.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems to me most of the benefit comes from the improved analysis. (Edit: which I imagine is pretty universal)
Maybe doing it generally here and undoing in X86ISelDAGToDAG
if the constant is single use (although I know we don't do a great job tracking that at the moment) would be ideal?
Shows cases where logical shifts of allsignbits values can be profitably converted to masks
If we're shift an all-signbits value, then we can just mask out the shifted bits. This helps removes some unnecessary bitcasted vXi16 shifts used for vXi8 shifts (which SimplifyDemandedBits will struggle to remove through the bitcast), and allows some AVX1 shifts of 256-bit values to stay as a YMM instruction. Noticed in codegen from llvm#82290
LGTM. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
If we're logical shifting an all-signbits value, then we can just mask out the shifted bits.
This helps removes some unnecessary bitcasted vXi16 shifts used for vXi8 shifts (which SimplifyDemandedBits will struggle to remove through the bitcast), and allows some AVX1 shifts of 256-bit values to stay as a YMM instruction.
Noticed in codegen from #82290