Commit 8961f3e

[X86] shouldReduceLoadWidth - don't split loads if we can freely reuse full width legal binop (llvm#129695)
Currently shouldReduceLoadWidth is very relaxed about when loads can be split to avoid extractions from the original full width load, resulting in many cases where the number of memory operations notably increases, trading the cost of an extract_subvector for additional loads.

This patch adjusts the 256/512-bit vector load splitting metric to detect cases where ANY use of the full width load is used directly, in which case we now reuse that load for smaller types, unless we'd need to extract an upper subvector or a scalar integer element; i.e. we now correctly treat (extract_subvector cst, 0) as free.

We retain the existing logic of never splitting loads if all uses are extract+stores, but improve it by peeking through bitcasts while looking for extract_subvector/store chains.

This required a number of fixes: shouldReduceLoadWidth now needs to peek through bitcasts UP the use-chain to find final users (limited to hasOneUse cases to reduce complexity), and it exposed an issue in isTargetCanonicalConstantNode, which assumed that a load of vector constant data would always be extracted, which is no longer the case.
1 parent bb9fa77 · commit 8961f3e

12 files changed: +2480 / -2453 lines
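For illustration, a hypothetical IR pattern (not taken from the patch's tests) showing the case the new heuristic keeps as a single load: the wide load has a legal full width binop user, so a lane-0 half extract can reuse the wide register for free, whereas the old metric would have split it into two narrower loads.

; Hypothetical AVX2-style sketch: %v has a full width legal binop user
; (the <8 x i32> add), so the low-half extract is expected to reuse %v
; rather than be rewritten as a second, narrower load from %p.
define <4 x i32> @reuse_wide_load(ptr %p, <8 x i32> %a, ptr %q) {
  %v = load <8 x i32>, ptr %p
  %add = add <8 x i32> %v, %a
  store <8 x i32> %add, ptr %q
  %lo = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %lo
}

Extracting the upper half, or a scalar integer element, would still favour the narrow load, matching the ByteOffset / NewVT.isScalarInteger() conditions in the code below.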

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 44 additions & 8 deletions
@@ -3264,6 +3264,12 @@ bool X86TargetLowering::shouldReduceLoadWidth(
     std::optional<unsigned> ByteOffset) const {
   assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
 
+  auto PeekThroughOneUserBitcasts = [](const SDNode *N) {
+    while (N->getOpcode() == ISD::BITCAST && N->hasOneUse())
+      N = *N->user_begin();
+    return N;
+  };
+
   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
   // relocation target a movq or addq instruction: don't let the load shrink.
   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
@@ -3273,24 +3279,46 @@ bool X86TargetLowering::shouldReduceLoadWidth(
 
   // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
   // those uses are extracted directly into a store, then the extract + store
-  // can be store-folded. Therefore, it's probably not worth splitting the load.
+  // can be store-folded, or (4) any use will be used by legal full width
+  // instruction. Then, it's probably not worth splitting the load.
   EVT VT = Load->getValueType(0);
   if ((VT.is256BitVector() || VT.is512BitVector()) &&
       !SDValue(Load, 0).hasOneUse()) {
+    bool FullWidthUse = false;
+    bool AllExtractStores = true;
     for (SDUse &Use : Load->uses()) {
       // Skip uses of the chain value. Result 0 of the node is the load value.
       if (Use.getResNo() != 0)
         continue;
 
-      SDNode *User = Use.getUser();
+      const SDNode *User = PeekThroughOneUserBitcasts(Use.getUser());
 
-      // If this use is not an extract + store, it's probably worth splitting.
-      if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
-          User->user_begin()->getOpcode() != ISD::STORE)
-        return true;
+      // If this use is an extract + store, it's probably not worth splitting.
+      if (User->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+          all_of(User->uses(), [&](const SDUse &U) {
+            const SDNode *Inner = PeekThroughOneUserBitcasts(U.getUser());
+            return Inner->getOpcode() == ISD::STORE;
+          }))
+        continue;
+
+      AllExtractStores = false;
+
+      // If any use is a full width legal/target bin op, then assume its legal
+      // and won't split.
+      if (isBinOp(User->getOpcode()) &&
+          (isOperationLegal(User->getOpcode(), User->getValueType(0)) ||
+           User->getOpcode() > ISD::BUILTIN_OP_END))
+        FullWidthUse = true;
     }
-    // All non-chain uses are extract + store.
-    return false;
+
+    if (AllExtractStores)
+      return false;
+
+    // If we have an user that uses the full vector width, then this use is
+    // only worth splitting if the offset isn't 0 (to avoid an
+    // EXTRACT_SUBVECTOR) or we're loading a scalar integer.
+    if (FullWidthUse)
+      return (ByteOffset.value_or(0) > 0) || NewVT.isScalarInteger();
   }
 
   return true;
@@ -59154,6 +59182,14 @@ static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
       return Res;
   }
 
+  // Match insertion of subvector load that perfectly aliases a base load.
+  if ((IdxVal % SubVecNumElts) == 0 && ISD::isNormalLoad(Vec.getNode()) &&
+      ISD::isNormalLoad(SubVec.getNode()) &&
+      DAG.areNonVolatileConsecutiveLoads(
+          cast<LoadSDNode>(SubVec), cast<LoadSDNode>(Vec),
+          SubVec.getValueSizeInBits() / 8, IdxVal / SubVecNumElts))
+    return Vec;
+
   return SDValue();
 }
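As a sketch of the shape this new combineINSERT_SUBVECTOR fold targets (hypothetical IR; the function name is illustrative, and it assumes SelectionDAG forms an INSERT_SUBVECTOR from the final shuffle): re-inserting a load that exactly re-reads the upper 16 bytes of an existing 32-byte load is a no-op, so the combine returns the base load unchanged.

; Hypothetical: %hi reloads bytes 16..31 of %base's memory and is inserted
; back at element index 4, so the insert is equivalent to %base itself.
define <8 x i32> @insert_aliasing_load(ptr %p) {
  %base = load <8 x i32>, ptr %p
  %hiptr = getelementptr inbounds i8, ptr %p, i64 16
  %hi = load <4 x i32>, ptr %hiptr
  %hiw = shufflevector <4 x i32> %hi, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %ins = shufflevector <8 x i32> %base, <8 x i32> %hiw, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x i32> %ins
}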

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 5 additions & 2 deletions
@@ -1337,15 +1337,18 @@ namespace llvm {
                         unsigned Depth) const override;
 
     bool isTargetCanonicalConstantNode(SDValue Op) const override {
-      // Peek through bitcasts/extracts/inserts to see if we have a broadcast
-      // vector from memory.
+      // Peek through bitcasts/extracts/inserts to see if we have a vector
+      // load/broadcast from memory.
       while (Op.getOpcode() == ISD::BITCAST ||
              Op.getOpcode() == ISD::EXTRACT_SUBVECTOR ||
              (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
               Op.getOperand(0).isUndef()))
         Op = Op.getOperand(Op.getOpcode() == ISD::INSERT_SUBVECTOR ? 1 : 0);
 
       return Op.getOpcode() == X86ISD::VBROADCAST_LOAD ||
+             Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
+             (Op.getOpcode() == ISD::LOAD &&
+              getTargetConstantFromLoad(cast<LoadSDNode>(Op))) ||
              TargetLowering::isTargetCanonicalConstantNode(Op);
     }

llvm/test/CodeGen/X86/oddsubvector.ll

Lines changed: 4 additions & 4 deletions
@@ -261,9 +261,9 @@ define void @PR42833() {
 ;
 ; AVX2-LABEL: PR42833:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl b(%rip), %eax
 ; AVX2-NEXT:    vmovdqu c+128(%rip), %ymm0
-; AVX2-NEXT:    addl c+128(%rip), %eax
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    addl b(%rip), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
@@ -284,10 +284,10 @@ define void @PR42833() {
 ;
 ; AVX512-LABEL: PR42833:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    movl b(%rip), %eax
 ; AVX512-NEXT:    vmovdqu c+128(%rip), %ymm0
 ; AVX512-NEXT:    vmovdqu64 c+128(%rip), %zmm1
-; AVX512-NEXT:    addl c+128(%rip), %eax
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    addl b(%rip), %eax
 ; AVX512-NEXT:    vmovd %eax, %xmm2
 ; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0

llvm/test/CodeGen/X86/setcc-lowering.ll

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ define <8 x i16> @pr25080(<8 x i32> %a) nounwind {
 ; AVX1-LABEL: pr25080:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
