Skip to content

Commit 7647f47

Browse files
authored
[X86] isShuffleFoldableLoad - only check that the SDValue has one use (#126900)
We don't need the entire load node to have one use, just the loaded value. This prevents uses of the load's chain result from interfering with shuffle commutation.
1 parent b101c35 commit 7647f47

6 files changed

+106
-132
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12480,7 +12480,7 @@ static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
1248012480
/// This is particularly important because the set of instructions varies
1248112481
/// significantly based on whether the operand is a load or not.
1248212482
static bool isShuffleFoldableLoad(SDValue V) {
12483-
return V->hasOneUse() &&
12483+
return V.hasOneUse() &&
1248412484
ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
1248512485
}
1248612486

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1665,10 +1665,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
16651665
;
16661666
; AVX512F-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
16671667
; AVX512F-FAST: # %bb.0:
1668-
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
1669-
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7]
1670-
; AVX512F-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1
1671-
; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
1668+
; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15]
1669+
; AVX512F-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0
1670+
; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
16721671
; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
16731672
; AVX512F-FAST-NEXT: vzeroupper
16741673
; AVX512F-FAST-NEXT: retq
@@ -1684,10 +1683,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
16841683
;
16851684
; AVX512DQ-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
16861685
; AVX512DQ-FAST: # %bb.0:
1687-
; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
1688-
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7]
1689-
; AVX512DQ-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1
1690-
; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm0
1686+
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15]
1687+
; AVX512DQ-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0
1688+
; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
16911689
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rdx)
16921690
; AVX512DQ-FAST-NEXT: vzeroupper
16931691
; AVX512DQ-FAST-NEXT: retq
@@ -1703,10 +1701,9 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
17031701
;
17041702
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
17051703
; AVX512BW-FAST: # %bb.0:
1706-
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
1707-
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,1,2,3,8,5,6,7]
1708-
; AVX512BW-FAST-NEXT: vpermi2d (%rdi), %ymm0, %ymm1
1709-
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm1, %zmm0
1704+
; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,9,10,11,0,13,14,15]
1705+
; AVX512BW-FAST-NEXT: vpermd (%rdi), %zmm0, %zmm0
1706+
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
17101707
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
17111708
; AVX512BW-FAST-NEXT: vzeroupper
17121709
; AVX512BW-FAST-NEXT: retq

llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -201,8 +201,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
201201
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
202202
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
203203
; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
204-
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
205-
; AVX512-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
204+
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
205+
; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
206206
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
207207
; AVX512-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
208208
; AVX512-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
@@ -260,8 +260,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
260260
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
261261
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
262262
; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
263-
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
264-
; AVX512DQ-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
263+
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
264+
; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
265265
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
266266
; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
267267
; AVX512DQ-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
@@ -319,8 +319,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
319319
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
320320
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
321321
; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
322-
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
323-
; AVX512BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
322+
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
323+
; AVX512BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
324324
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
325325
; AVX512BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
326326
; AVX512BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
@@ -378,8 +378,8 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
378378
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
379379
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
380380
; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm3
381-
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,5,0,0]
382-
; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm5, %xmm4, %xmm6
381+
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [7,1,0,0]
382+
; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm6
383383
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,2,0,0]
384384
; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
385385
; AVX512DQ-BW-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4

llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -403,13 +403,12 @@ define void @PR39483() {
403403
;
404404
; X86-AVX512-LABEL: PR39483:
405405
; X86-AVX512: # %bb.0: # %entry
406-
; X86-AVX512-NEXT: vmovups 0, %zmm0
407-
; X86-AVX512-NEXT: vmovups 64, %ymm1
408-
; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
409-
; X86-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
410-
; X86-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
411-
; X86-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
412-
; X86-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
406+
; X86-AVX512-NEXT: vmovups 64, %ymm0
407+
; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7]
408+
; X86-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0
409+
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
410+
; X86-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
411+
; X86-AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
413412
; X86-AVX512-NEXT: vmovups %ymm0, (%eax)
414413
;
415414
; X64-AVX1-LABEL: PR39483:
@@ -444,13 +443,12 @@ define void @PR39483() {
444443
;
445444
; X64-AVX512-LABEL: PR39483:
446445
; X64-AVX512: # %bb.0: # %entry
447-
; X64-AVX512-NEXT: vmovups 0, %zmm0
448-
; X64-AVX512-NEXT: vmovups 64, %ymm1
449-
; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,8,11,14,17,20,23]
450-
; X64-AVX512-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
451-
; X64-AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
452-
; X64-AVX512-NEXT: vmulps %ymm0, %ymm2, %ymm1
453-
; X64-AVX512-NEXT: vaddps %ymm0, %ymm1, %ymm0
446+
; X64-AVX512-NEXT: vmovups 64, %ymm0
447+
; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [18,21,24,27,30,1,4,7]
448+
; X64-AVX512-NEXT: vpermt2ps 0, %zmm1, %zmm0
449+
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
450+
; X64-AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
451+
; X64-AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0
454452
; X64-AVX512-NEXT: vmovups %ymm0, (%rax)
455453
entry:
456454
%wide.vec = load <24 x float>, ptr null, align 4

llvm/test/CodeGen/X86/vselect-avx.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -377,14 +377,14 @@ define void @vselect_concat_splat() {
377377
; AVX512-NEXT: vmovaps %ymm2, %ymm3
378378
; AVX512-NEXT: vpermi2ps %ymm1, %ymm0, %ymm3
379379
; AVX512-NEXT: vmovups 32, %xmm4
380-
; AVX512-NEXT: vmovups 0, %ymm5
381-
; AVX512-NEXT: vxorps %xmm6, %xmm6, %xmm6
382-
; AVX512-NEXT: vcmpneqps %xmm6, %xmm3, %k0
380+
; AVX512-NEXT: vxorps %xmm5, %xmm5, %xmm5
381+
; AVX512-NEXT: vcmpneqps %xmm5, %xmm3, %k0
383382
; AVX512-NEXT: kshiftlw $4, %k0, %k1
384383
; AVX512-NEXT: korw %k1, %k0, %k1
385-
; AVX512-NEXT: vpermt2ps %ymm4, %ymm2, %ymm5
386384
; AVX512-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
387-
; AVX512-NEXT: vmovaps %ymm5, %ymm0 {%k1}
385+
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,11,14,1,9,12,15,2]
386+
; AVX512-NEXT: vpermi2ps 0, %ymm4, %ymm1
387+
; AVX512-NEXT: vmovaps %ymm1, %ymm0 {%k1}
388388
; AVX512-NEXT: vmovups %ymm0, (%rax)
389389
; AVX512-NEXT: vzeroupper
390390
; AVX512-NEXT: retq

0 commit comments

Comments
 (0)