
Commit b107dbb

[X86] Reuse X86ISD::SUBV_BROADCAST_LOAD for subvector loads across chains (#142381)
Improve handling of folding a (small) vector load that is also loaded as an X86ISD::SUBV_BROADCAST_LOAD node: the load can instead (freely) extract the bottom subvector of the broadcast. Similar to #139575, we should be checking that the SUBV_BROADCAST_LOAD has uses of the loaded value, not that its out chain is non-empty, to ensure it is actually used. We must also call makeEquivalentMemoryOrdering to ensure the out chains are correctly merged, handling any aliasing with later loads/stores.

This PR is a little messy, as it contains two other inter-dependent changes to avoid regressions: now that we properly merge subvector loads, we can drop the one-use limit on the "vperm2x128(load(p),undef) -> broadcast128(p+offset)" and "insert_subvector(load256(p),load128(p),0) -> broadcast128(p)" folds.
1 parent b4ded99 commit b107dbb

13 files changed: +3642 -3844 lines
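Before the diffs, a short annotated sketch of the central combineLoad fold may help. It is hand-trimmed from the X86ISelLowering.cpp hunk below; the enclosing loop and the final return are paraphrased assumptions, not verbatim from the tree:

// Sketch: N is a plain vector load of MemVT from Ptr on Chain, and we
// search for an existing X86ISD::SUBV_BROADCAST_LOAD of the same memory.
for (SDNode *User : Chain->users()) {
  auto *UserLd = dyn_cast<MemSDNode>(User);
  if (User != N && UserLd &&
      User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
      UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
      UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
      // New check: the broadcast value (result 0) must be live, replacing
      // the old proxy of requiring the out chain (result 1) to be unused.
      User->hasAnyUseOfValue(0) &&
      User->getValueSizeInBits(0).getFixedValue() >
          RegVT.getFixedSizeInBits()) {
    // New: merge the two loads' out chains so later loads/stores that may
    // alias remain ordered after both memory operations.
    DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), SDValue(User, 1));
    // The broadcast already carries the data, so the original load becomes
    // a free extract of the bottom subvector.
    SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
                                       RegVT.getSizeInBits());
    return DAG.getBitcast(RegVT, Extract);
  }
}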

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 4 additions & 5 deletions
@@ -42672,7 +42672,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, LHS);
     bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, LHS);
     if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() &&
-        X86::mayFoldLoad(LHS, Subtarget)) {
+        X86::mayFoldLoad(LHS, Subtarget, /*AssumeSingleUse=*/true)) {
       MVT MemVT = VT.getHalfNumVectorElementsVT();
       unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
       return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, VT, MemVT,
@@ -53144,9 +53144,10 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
           User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
           UserLd->getChain() == Chain && UserLd->getBasePtr() == Ptr &&
           UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits() &&
-          !User->hasAnyUseOfValue(1) &&
+          User->hasAnyUseOfValue(0) &&
           User->getValueSizeInBits(0).getFixedValue() >
               RegVT.getFixedSizeInBits()) {
+        DAG.makeEquivalentMemoryOrdering(SDValue(N, 1), SDValue(User, 1));
         SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, dl,
                                            RegVT.getSizeInBits());
         Extract = DAG.getBitcast(RegVT, Extract);
@@ -59442,10 +59443,8 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
   // If we're splatting the lower half subvector of a full vector load into the
   // upper half, just splat the subvector directly, potentially creating a
   // subvector broadcast.
-  // TODO: Drop hasOneUse checks.
   if ((int)IdxVal == (VecNumElts / 2) &&
-      Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits()) &&
-      (Vec.hasOneUse() || SubVec.hasOneUse())) {
+      Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
     auto *VecLd = dyn_cast<LoadSDNode>(Vec);
     auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
     if (VecLd && SubLd &&
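The makeEquivalentMemoryOrdering call is the correctness half of the change: it ties the replaced load's out chain to the broadcast load's out chain, so later, possibly aliasing loads/stores stay ordered after both. A minimal sketch of its behavior, paraphrased from SelectionDAG rather than quoted:

// Paraphrased sketch of SelectionDAG::makeEquivalentMemoryOrdering: after
// this runs, every user of OldChain is ordered after both memory ops.
SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
                                                   SDValue NewMemOpChain) {
  if (OldChain == NewMemOpChain || OldChain.use_empty())
    return NewMemOpChain;
  // Tie the two chains together...
  SDValue TokenFactor = getNode(ISD::TokenFactor, SDLoc(OldChain), MVT::Other,
                                OldChain, NewMemOpChain);
  // ...reroute all users of the old chain through the TokenFactor...
  ReplaceAllUsesOfValueWith(OldChain, TokenFactor);
  // ...and restore the TokenFactor's own operands, which the replacement
  // above also rewrote.
  UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewMemOpChain);
  return TokenFactor;
}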

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 1 addition & 1 deletion
@@ -3367,7 +3367,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7]
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1]
+; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 46 additions & 48 deletions
@@ -1683,33 +1683,32 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ;
 ; AVX1-LABEL: interleave_24i32_in:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vmovupd (%rcx), %ymm0
-; AVX1-NEXT: vmovups (%rdx), %xmm1
-; AVX1-NEXT: vmovups 16(%rdx), %xmm2
-; AVX1-NEXT: vmovups (%rsi), %xmm3
-; AVX1-NEXT: vmovups 16(%rsi), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
-; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
-; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; AVX1-NEXT: vmovups (%rdx), %xmm0
+; AVX1-NEXT: vmovups 16(%rdx), %xmm1
+; AVX1-NEXT: vmovups (%rsi), %xmm2
+; AVX1-NEXT: vmovups 16(%rsi), %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3],xmm1[3,3]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[0,2]
 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vbroadcastsd (%rcx), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7]
-; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = mem[1,0,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,1,2,2]
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
-; AVX1-NEXT: vmovups %ymm0, 32(%rdi)
-; AVX1-NEXT: vmovups %ymm1, (%rdi)
-; AVX1-NEXT: vmovups %ymm2, 64(%rdi)
+; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = mem[0,1,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0,0,3,3]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm0[1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2]
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1]
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = mem[0,0,3,3,4,4,7,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,0,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
+; AVX1-NEXT: vmovups %ymm2, 32(%rdi)
+; AVX1-NEXT: vmovups %ymm0, (%rdi)
+; AVX1-NEXT: vmovups %ymm1, 64(%rdi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
@@ -1804,30 +1803,29 @@ define void @interleave_24i32_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; XOP-NEXT: vmovups (%rsi), %ymm0
 ; XOP-NEXT: vmovups (%rdx), %ymm1
 ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm0[u,3],ymm1[3],ymm0[u,4],ymm1[4],ymm0[u,5]
-; XOP-NEXT: vmovups (%rcx), %ymm1
-; XOP-NEXT: vmovups (%rdx), %xmm2
-; XOP-NEXT: vmovups 16(%rdx), %xmm3
-; XOP-NEXT: vmovups (%rsi), %xmm4
-; XOP-NEXT: vmovups 16(%rsi), %xmm5
-; XOP-NEXT: vshufps {{.*#+}} xmm6 = xmm5[3,3],xmm3[3,3]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1],xmm5[0,2]
-; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; XOP-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,2,3]
-; XOP-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0,0,3,3]
-; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
-; XOP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm4[1],xmm2[1]
-; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2]
-; XOP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm4[0]
-; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1]
+; XOP-NEXT: vmovups (%rdx), %xmm1
+; XOP-NEXT: vmovups 16(%rdx), %xmm2
+; XOP-NEXT: vmovups (%rsi), %xmm3
+; XOP-NEXT: vmovups 16(%rsi), %xmm4
+; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,3],xmm2[3,3]
+; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
+; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1],xmm4[0,2]
 ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; XOP-NEXT: vbroadcastsd (%rcx), %ymm4
-; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7]
-; XOP-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1,1,2,2]
-; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
+; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = mem[0,1,0,1]
+; XOP-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0,0,3,3]
+; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
+; XOP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm3[1],xmm1[1]
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2]
+; XOP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1]
+; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; XOP-NEXT: vbroadcastsd (%rcx), %ymm3
+; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7]
+; XOP-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2]
+; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
 ; XOP-NEXT: vmovups %ymm0, 32(%rdi)
-; XOP-NEXT: vmovups %ymm2, (%rdi)
-; XOP-NEXT: vmovups %ymm3, 64(%rdi)
+; XOP-NEXT: vmovups %ymm1, (%rdi)
+; XOP-NEXT: vmovups %ymm2, 64(%rdi)
 ; XOP-NEXT: vzeroupper
 ; XOP-NEXT: retq
 %s1 = load <8 x i32>, ptr %q1, align 4

llvm/test/CodeGen/X86/vector-interleave.ll

Lines changed: 4 additions & 4 deletions
@@ -576,12 +576,12 @@ define void @splat2_i64(ptr %s, ptr %d) {
 ;
 ; AVX1-LABEL: splat2_i64:
 ; AVX1: # %bb.0:
-; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
+; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
 ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
-; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
+; AVX1-NEXT: vbroadcastf128 16(%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
 ; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
-; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
-; AVX1-NEXT: vmovupd %ymm1, (%rsi)
+; AVX1-NEXT: vmovupd %ymm1, 32(%rsi)
+; AVX1-NEXT: vmovupd %ymm0, (%rsi)
 ; AVX1-NEXT: vzeroupper
 ; AVX1-NEXT: retq
 ;
