[X86] combineEXTRACT_SUBVECTOR - generalize extract_subvector(broadcast(x),c) fold with IsElementEquivalent #141963
Conversation
[X86] combineEXTRACT_SUBVECTOR - generalize extract_subvector(broadcast(x),c) fold with IsElementEquivalent. Instead of matching the broadcast nodes directly, let IsElementEquivalent handle it to allow BITCAST handling. This resulted in a much simpler implementation than adding BITCAST matching, which we already have in IsElementEquivalent.
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes: Instead of matching the broadcast nodes directly, let IsElementEquivalent handle it to allow BITCAST handling, which we already have with IsElementEquivalent.
Full diff: https://github.com/llvm/llvm-project/pull/141963.diff
3 Files Affected:
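For readers less familiar with this combine, the reasoning behind the fold can be illustrated without any SelectionDAG machinery: if every lane of the source vector holds the same value (a broadcast/splat), then the upper subvector at index IdxVal contains exactly the same elements as the lowest subvector, and that remains true when the same bytes are viewed at a different element width - the BITCAST case that IsElementEquivalent looks through. The standalone C++ sketch below only illustrates that equivalence; the array-based modelling and names (Splat8, Splat32) are illustrative and are not part of the patch.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // Model a v16i8 broadcast of the byte 0x2A (think X86ISD::VBROADCAST
  // widened to 16 lanes).
  std::array<uint8_t, 16> Splat8;
  Splat8.fill(0x2A);

  // Reinterpret the same 16 bytes as v4i32 - the bitcast-of-a-broadcast
  // case that the generalized fold now catches via IsElementEquivalent.
  std::array<uint32_t, 4> Splat32;
  std::memcpy(Splat32.data(), Splat8.data(), sizeof(Splat32));

  // Extracting the "upper" subvector (elements 2..3) yields the same
  // values as extracting the lowest subvector (elements 0..1): every
  // lane I is element-equivalent to lane I + IdxVal.
  const unsigned NumSubElts = 2, IdxVal = 2;
  for (unsigned I = 0; I != NumSubElts; ++I)
    assert(Splat32[I] == Splat32[I + IdxVal]);

  return 0;
}

Once the extract index has been rewritten to 0, SimplifyDemandedVectorElts can narrow the broadcast itself, which is what drops the vinserti64x4/vextracti64x4 instructions and shrinks several zmm broadcasts to ymm in the test changes below.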
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6b71f49165c60..be1939eb95fa1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59549,6 +59549,7 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
unsigned SizeInBits = VT.getSizeInBits();
unsigned InSizeInBits = InVecVT.getSizeInBits();
unsigned NumSubElts = VT.getVectorNumElements();
+ unsigned NumInElts = InVecVT.getVectorNumElements();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc DL(N);
@@ -59615,22 +59616,22 @@ static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
}
}
- // If we're extracting an upper subvector from a broadcast we should just
- // extract the lowest subvector instead which should allow
+ // If we're extracting an upper subvector see if we'd get the same elements if
+ // we extracted the lowest subvector instead which should allow
// SimplifyDemandedVectorElts do more simplifications.
- if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
- InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
- DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
- return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ if (IdxVal != 0) {
+ bool AllEquiv = all_of(seq<unsigned>(NumSubElts), [&](unsigned I) {
+ return IsElementEquivalent(NumInElts, InVec, InVec, I, I + IdxVal);
+ });
+ if (AllEquiv)
+ return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
+ }
// Check if we're extracting a whole broadcasted subvector.
if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
auto *MemIntr = cast<MemIntrinsicSDNode>(InVec);
EVT MemVT = MemIntr->getMemoryVT();
if (MemVT == VT) {
- // Just use the lowest subvector.
- if (IdxVal != 0)
- return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
// If this is the only use, we can replace with a regular load (this may
// have been missed by SimplifyDemandedVectorElts due to extra uses of the
// memory chain).
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index d6e6ad184e03f..f2e4da0ac5400 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -3951,7 +3951,6 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; AVX512F-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -3962,7 +3961,6 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in
; AVX512DQ-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4004,7 +4002,7 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
;
; AVX-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastss (%rdi), %ymm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,0,0,0]
; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
@@ -4013,7 +4011,6 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
@@ -4029,7 +4026,6 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
; AVX512F-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4040,7 +4036,6 @@ define void @vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16(ptr %in
; AVX512DQ-LABEL: vec512_i8_widen_to_i32_factor4_broadcast_to_v16i32_factor16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4082,7 +4077,7 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.e
;
; AVX-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX: # %bb.0:
-; AVX-NEXT: vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,1,0,1]
; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1
; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3
@@ -4091,7 +4086,6 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.e
; AVX-NEXT: vmovdqa %xmm3, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
@@ -4107,7 +4101,6 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.e
; AVX512F-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4118,7 +4111,6 @@ define void @vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8(ptr %in.e
; AVX512DQ-LABEL: vec512_i8_widen_to_i64_factor8_broadcast_to_v8i64_factor8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4184,7 +4176,6 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i
; AVX512F-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4195,7 +4186,6 @@ define void @vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4(ptr %i
; AVX512DQ-LABEL: vec512_i8_widen_to_i128_factor16_broadcast_to_v4i128_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastb (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4338,7 +4328,6 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
; AVX512F-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4349,7 +4338,6 @@ define void @vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16(ptr %i
; AVX512DQ-LABEL: vec512_i16_widen_to_i32_factor2_broadcast_to_v16i32_factor16:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4418,7 +4406,6 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
; AVX512F-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4429,7 +4416,6 @@ define void @vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8(ptr %in.
; AVX512DQ-LABEL: vec512_i16_widen_to_i64_factor4_broadcast_to_v8i64_factor8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4497,7 +4483,6 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
; AVX512F-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastw (%rdi), %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4508,7 +4493,6 @@ define void @vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4(ptr %i
; AVX512DQ-LABEL: vec512_i16_widen_to_i128_factor8_broadcast_to_v4i128_factor4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpbroadcastw (%rdi), %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4654,7 +4638,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
;
; AVX512F-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0
+; AVX512F-NEXT: vpbroadcastd (%rdi), %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4664,7 +4648,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in.
;
; AVX512DQ-LABEL: vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0
+; AVX512DQ-NEXT: vpbroadcastd (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4731,7 +4715,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i
;
; AVX512F-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd (%rdi), %zmm0
+; AVX512F-NEXT: vpbroadcastd (%rdi), %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4741,7 +4725,7 @@ define void @vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4(ptr %i
;
; AVX512DQ-LABEL: vec512_i32_widen_to_i128_factor4_broadcast_to_v4i128_factor4:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastd (%rdi), %zmm0
+; AVX512DQ-NEXT: vpbroadcastd (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4886,7 +4870,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
;
; AVX512F-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq (%rdi), %zmm0
+; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0
; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
@@ -4896,7 +4880,7 @@ define void @vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4(ptr %i
;
; AVX512DQ-LABEL: vec512_i64_widen_to_i128_factor2_broadcast_to_v4i128_factor4:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastq (%rdi), %zmm0
+; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0
; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm1
; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
index 40ad731bff85c..fcde7ffe30ff2 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
@@ -134,9 +134,8 @@ define <16 x i32> @test_broadcast_4i32_16i32(ptr%p) nounwind {
define <32 x i16> @test_broadcast_8i16_32i16(ptr%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512VL-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512VL-NEXT: retq
@@ -149,9 +148,8 @@ define <32 x i16> @test_broadcast_8i16_32i16(ptr%p) nounwind {
;
; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512DQVL-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512DQVL-NEXT: retq
@@ -164,9 +162,8 @@ define <32 x i16> @test_broadcast_8i16_32i16(ptr%p) nounwind {
define <64 x i8> @test_broadcast_16i8_64i8(ptr%p) nounwind {
; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512VL: ## %bb.0:
-; X64-AVX512VL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512VL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; X64-AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512VL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512VL-NEXT: retq
@@ -179,9 +176,8 @@ define <64 x i8> @test_broadcast_16i8_64i8(ptr%p) nounwind {
;
; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQVL: ## %bb.0:
-; X64-AVX512DQVL-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512DQVL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQVL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
-; X64-AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; X64-AVX512DQVL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-AVX512DQVL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; X64-AVX512DQVL-NEXT: retq
LGTM.