[X86] combineKSHIFT - fold kshiftr(kshiftr/extract_subvector(X,C1),C2) --> kshiftr(X,C1+C2) #115528
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Merge serial KSHIFTR nodes, possibly separated by EXTRACT_SUBVECTOR, to allow mask instructions to be computed in parallel.

Patch is 45.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115528.diff

9 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19a85a6d7ec6ce..748f885e3f8d90 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58405,11 +58405,30 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
-
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Fold kshiftr(extract_subvector(X,C1),C2)
+ // --> extract_subvector(kshiftr(X,C1+C2),0)
+ // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
+ if (N->getOpcode() == X86ISD::KSHIFTR) {
+ SDLoc DL(N);
+ if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+ N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
+ SDValue Src = N->getOperand(0).getOperand(0);
+ uint64_t Amt = N->getConstantOperandVal(1) +
+ N->getOperand(0).getConstantOperandVal(1);
+ EVT SrcVT = Src.getValueType();
+ if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
+ SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
+ DAG.getTargetConstant(Amt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+ }
+
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
index 537f42dd9c2c59..e0f3b6c4ec90a4 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -7,11 +7,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
@@ -24,11 +24,11 @@ define <32 x i64> @test_load_32i64(ptr %ptrs, <32 x i1> %mask, <32 x i64> %src0)
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: retq
%res = call <32 x i64> @llvm.masked.load.v32i64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index bd52b9cd41584c..f6e5986afac531 100644
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -261,11 +261,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kshiftrd $8, %k1, %k2
; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; SKX-NEXT: kshiftrd $16, %k1, %k1
-; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: kshiftrd $16, %k1, %k2
+; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; SKX-NEXT: kshiftrd $24, %k1, %k1
; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; SKX-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index 83d3a33572266f..c879cb9867ab29 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -17,23 +17,23 @@ target triple = "x86_64-unknown-linux-gnu"
; KNL-NEXT: fldz
; KNL-NEXT: fld %st(0)
; KNL-NEXT: fcmovne %st(2), %st
-; KNL-NEXT: testb $2, %al
-; KNL-NEXT: fld %st(1)
-; KNL-NEXT: fcmovne %st(3), %st
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fld %st(1)
+; KNL-NEXT: fcmovne %st(3), %st
+; KNL-NEXT: testb $2, %al
; KNL-NEXT: fld %st(2)
; KNL-NEXT: fcmovne %st(4), %st
-; KNL-NEXT: testb $2, %al
+; KNL-NEXT: testb $8, %al
; KNL-NEXT: fxch %st(3)
; KNL-NEXT: fcmovne %st(4), %st
; KNL-NEXT: fstp %st(4)
; KNL-NEXT: fxch %st(3)
+; KNL-NEXT: fstpt 30(%rdi)
+; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt 10(%rdi)
; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt (%rdi)
-; KNL-NEXT: fxch %st(1)
-; KNL-NEXT: fstpt 30(%rdi)
; KNL-NEXT: fstpt 20(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@@ -49,23 +49,23 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-NEXT: fldz
; SKX-NEXT: fld %st(0)
; SKX-NEXT: fcmovne %st(2), %st
-; SKX-NEXT: testb $2, %al
-; SKX-NEXT: fld %st(1)
-; SKX-NEXT: fcmovne %st(3), %st
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fld %st(1)
+; SKX-NEXT: fcmovne %st(3), %st
+; SKX-NEXT: testb $2, %al
; SKX-NEXT: fld %st(2)
; SKX-NEXT: fcmovne %st(4), %st
-; SKX-NEXT: testb $2, %al
+; SKX-NEXT: testb $8, %al
; SKX-NEXT: fxch %st(3)
; SKX-NEXT: fcmovne %st(4), %st
; SKX-NEXT: fstp %st(4)
; SKX-NEXT: fxch %st(3)
+; SKX-NEXT: fstpt 30(%rdi)
+; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt 10(%rdi)
; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt (%rdi)
-; SKX-NEXT: fxch %st(1)
-; SKX-NEXT: fstpt 30(%rdi)
; SKX-NEXT: fstpt 20(%rdi)
; SKX-NEXT: retq
bb:
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index 29922c2ac1a716..5b2431eb214955 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -51,18 +51,18 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
; AVX512VL-NEXT: kshiftrb $2, %k0, %k1
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $8, %al
; AVX512VL-NEXT: fld1
; AVX512VL-NEXT: fldz
; AVX512VL-NEXT: fld %st(0)
; AVX512VL-NEXT: fcmovne %st(2), %st
-; AVX512VL-NEXT: testb $1, %al
+; AVX512VL-NEXT: testb $2, %al
; AVX512VL-NEXT: fld %st(1)
; AVX512VL-NEXT: fcmovne %st(3), %st
-; AVX512VL-NEXT: kmovd %k1, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fld %st(2)
; AVX512VL-NEXT: fcmovne %st(4), %st
+; AVX512VL-NEXT: kmovd %k1, %eax
; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fxch %st(3)
; AVX512VL-NEXT: fcmovne %st(4), %st
@@ -77,12 +77,12 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-NEXT: fstpt 10(%rdi)
; AVX512VL-NEXT: fxch %st(1)
; AVX512VL-NEXT: fadd %st, %st(0)
+; AVX512VL-NEXT: fstpt 60(%rdi)
+; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 20(%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt (%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
-; AVX512VL-NEXT: fstpt 60(%rdi)
-; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 40(%rdi)
%1 = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %a
%2 = select <4 x i1> %1, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 22b5246443fa8a..7e081310c35be5 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2668,11 +2668,11 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 4d7d2573183e07..68c6ca93576b76 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2329,11 +2329,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index f8c076db65de94..17b98b5ebcaeae 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -840,12 +840,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240
; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovb2m %zmm0, %k1
+; AVX512VL-NEXT: kshiftrq $48, %k1, %k3
; AVX512VL-NEXT: kshiftrq $32, %k1, %k4
-; AVX512VL-NEXT: kshiftrd $16, %k4, %k3
-; AVX512VL-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VL-NEXT: kshiftrq $16, %k1, %k2
; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k1, %k0
+; AVX512VL-NEXT: kshiftrq $8, %k1, %k0
; AVX512VL-NEXT: kxorw %k0, %k1, %k0
; AVX512VL-NEXT: kshiftrw $4, %k0, %k5
; AVX512VL-NEXT: kxorw %k5, %k0, %k0
@@ -859,7 +859,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4)
; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k4, %k0
+; AVX512VL-NEXT: kshiftrq $40, %k1, %k0
; AVX512VL-NEXT: kxorw %k0, %k4, %k0
; AVX512VL-NEXT: kshiftrw $4, %k0, %k4
; AVX512VL-NEXT: kxorw %k4, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 85e782e9083492..36a902637272d7 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -256,12 +256,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -277,12 +277,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -409,19 +409,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -444,19 +444,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -2605,12 +2605,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2626,12 +2626,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2753,19 +2753,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX...
[truncated]
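For illustration, the effect of the fold can be seen directly in the vec_smulo.ll diff above: before, the second mask shift consumed the result of the first, forming a serial chain; after, both shifts read the original mask register and can execute in parallel.

Before (serial - the second shift waits on the first):
    kshiftrq $32, %k1, %k1
    kshiftrd $16, %k1, %k1
After (independent - both shifts read the original %k1):
    kshiftrq $32, %k1, %k2
    kshiftrq $48, %k1, %k1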
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  // Fold kshiftr(extract_subvector(X,C1),C2)
  // --> extract_subvector(kshiftr(X,C1+C2),0)
  // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
Does this also apply to kshiftl?
(kshiftl (kshiftl x, c1), c2) might be useful - not sure if we need to support kshiftl+insert_subvector/concat_subvectors patterns or not though.
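A minimal sketch of what the analogous left-shift merge might look like, mirroring the KSHIFTR handling added in this patch (hypothetical - it is not part of this change, and assumes both shift amounts are constants and the merged amount stays below the mask element count):

    // Hypothetical: fold kshiftl(kshiftl(X,C1),C2) --> kshiftl(X,C1+C2)
    if (N->getOpcode() == X86ISD::KSHIFTL &&
        N->getOperand(0).getOpcode() == X86ISD::KSHIFTL) {
      SDLoc DL(N);
      uint64_t Amt = N->getConstantOperandVal(1) +
                     N->getOperand(0).getConstantOperandVal(1);
      // A combined amount that shifts out every element yields zero, so
      // only merge while the amount stays in range for the mask type.
      if (Amt < VT.getVectorNumElements())
        return DAG.getNode(X86ISD::KSHIFTL, DL, VT,
                           N->getOperand(0).getOperand(0),
                           DAG.getTargetConstant(Amt, DL, MVT::i8));
    }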
I think parallelism is not that important for mask operations, because they mainly serve vector instructions. That said, contiguous mask operations would not be a bottleneck in a real workload.
@phoebewang I'd still like to get this in - do you think this will just be neutral, or could it actually cause regressions?
> @phoebewang I'd still like to get this in - do you think this will just be neutral, or could it actually cause regressions?
I'm neutral on the patch. I haven't tested the performance, but I assume the effect is minor. So land it as you like :)
Force-pushed from cc543bb to 24e36b2.

[X86] combineKSHIFT - fold kshiftr(kshiftr/extract_subvector(X,C1),C2) --> kshiftr(X,C1+C2)
Merge serial KSHIFTR nodes, possibly separated by EXTRACT_SUBVECTOR, to allow mask instructions to be computed in parallel.