Skip to content

Commit

Permalink
DAG: Avoid forming shufflevector from a single extract_vector_elt (ll…
Browse files Browse the repository at this point in the history
…vm#122672)

This avoids regressions in a future AMDGPU commit. Previously we
would have a build_vector (extract_vector_elt x), undef with free
access to the elements bloated into a shuffle of one element + undef,
which has much worse combine support than the extract.

Alternatively could check aggressivelyPreferBuildVectorSources, but
I'm not sure it's really different than isExtractVecEltCheap.
  • Loading branch information
arsenm authored Jan 17, 2025
1 parent afc43a7 commit 7475f0a
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 10 deletions.
36 changes: 31 additions & 5 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23807,6 +23807,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
SmallVector<SDValue, 8> VecIn;
VecIn.push_back(SDValue());

// If we have a single extract_element with a constant index, track the index
// value.
unsigned OneConstExtractIndex = ~0u;

// Count the number of extract_vector_elt sources (i.e. non-constant or undef)
unsigned NumExtracts = 0;

for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);

Expand All @@ -23824,23 +23831,28 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {

// Not an undef or zero. If the input is something other than an
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op.getOperand(1)))
if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
SDValue ExtractedFromVec = Op.getOperand(0);

SDValue ExtractedFromVec = Op.getOperand(0);
if (ExtractedFromVec.getValueType().isScalableVector())
return SDValue();
auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!ExtractIdx)
return SDValue();

const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
if (ExtractIdx->getAsAPIntVal().uge(
ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();

// All inputs must have the same element type as the output.
if (VT.getVectorElementType() !=
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();

OneConstExtractIndex = ExtractIdx->getZExtValue();
++NumExtracts;

// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
// a map back from SDValues to numbers isn't worth it.
Expand All @@ -23863,6 +23875,20 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// VecIn accordingly.
bool DidSplitVec = false;
if (VecIn.size() == 2) {
// If we only found a single constant indexed extract_vector_elt feeding the
// build_vector, do not produce a more complicated shuffle if the extract is
// cheap with other constant/undef elements. Skip broadcast patterns with
// multiple uses in the build_vector.

// TODO: This should be more aggressive about skipping the shuffle
// formation, particularly if VecIn[1].hasOneUse(), and regardless of the
// index.
if (NumExtracts == 1 &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
TLI.isTypeLegal(VT.getVectorElementType()) &&
TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
return SDValue();

unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
Expand Down

0 comments on commit 7475f0a

Please sign in to comment.