Skip to content

Commit

Permalink
[AArch64] Add lane moves to PerfectShuffle tables
Browse files Browse the repository at this point in the history
This teaches the perfect shuffle tables about lane inserts, which can
help reduce the cost of many entries. Many of the shuffle masks are only
one lane away from being correct, and a single lane move can be a lot
simpler than trying to use ext/zip/etc. Because lane moves are not exactly
like the other masks handled in the perfect shuffle tables, they require
special casing to generate them, with a special InsOp Operator.

The lane to insert into is encoded as the RHSID, and the lane to move from
is taken from the original mask. This helps reduce the maximum perfect
shuffle entry cost to 3, with many more shuffles being generatable in a
single instruction.

Differential Revision: https://reviews.llvm.org/D123386
  • Loading branch information
davemgreen committed Apr 19, 2022
1 parent 7adfa31 commit 73dc996
Show file tree
Hide file tree
Showing 10 changed files with 2,741 additions and 2,651 deletions.
67 changes: 56 additions & 11 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9745,8 +9745,12 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
}

/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
/// the specified operations to build the shuffle. ID is the perfect-shuffle
/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect
/// shuffle table entry and LHS/RHS are the immediate inputs for this stage of
/// the shuffle.
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
SDValue V2, unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
Expand All @@ -9763,12 +9767,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
OP_VEXT1,
OP_VEXT2,
OP_VEXT3,
OP_VUZPL, // VUZP, left result
OP_VUZPR, // VUZP, right result
OP_VZIPL, // VZIP, left result
OP_VZIPR, // VZIP, right result
OP_VTRNL, // VTRN, left result
OP_VTRNR // VTRN, right result
OP_VUZPL, // VUZP, left result
OP_VUZPR, // VUZP, right result
OP_VZIPL, // VZIP, left result
OP_VZIPR, // VZIP, right result
OP_VTRNL, // VTRN, left result
OP_VTRNR, // VTRN, right result
OP_MOVLANE // Move lane. RHSID is the lane to move into
};

if (OpNum == OP_COPY) {
Expand All @@ -9778,9 +9783,48 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return RHS;
}

if (OpNum == OP_MOVLANE) {
// Extract the mask element that the perfect-shuffle ID encodes for lane Elt.
// The ID is a four-digit base-9 number with lane 0 in the most significant
// digit; a digit value of 8 marks an undef lane and is reported as -1.
auto getPFIDLane = [](unsigned ID, int Elt) -> int {
  assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
  // Discard the digits that sit below the requested lane.
  for (int Remaining = 3 - Elt; Remaining > 0; --Remaining)
    ID /= 9;
  const unsigned Digit = ID % 9;
  if (Digit == 8)
    return -1;
  return static_cast<int>(Digit);
};

// For OP_MOVLANE shuffles, the RHSID represents the lane to move into. The
// lane to move from is read out of the PFID, and always refers to one of the
// original vectors (V1 or V2).
SDValue OpLHS = GeneratePerfectShuffle(
LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
EVT VT = OpLHS.getValueType();
assert(RHSID < 8 && "Expected a lane index for RHSID!");
int MaskElt = getPFIDLane(ID, RHSID);
assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
unsigned ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
SDValue Input = MaskElt < 4 ? V1 : V2;
// Be careful about creating illegal types. Use f16 instead of i16.
if (VT == MVT::v4i16) {
Input = DAG.getBitcast(MVT::v4f16, Input);
OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
}
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
Input.getValueType().getVectorElementType(),
Input, DAG.getVectorIdxConstant(ExtLane, dl));
SDValue Ins =
DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
return DAG.getBitcast(VT, Ins);
}

SDValue OpLHS, OpRHS;
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
RHS, DAG, dl);
EVT VT = OpLHS.getValueType();

switch (OpNum) {
Expand Down Expand Up @@ -10239,7 +10283,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
dl);
}

return GenerateTBL(Op, ShuffleMask, DAG);
Expand Down
5,046 changes: 2,523 additions & 2,523 deletions llvm/lib/Target/AArch64/AArch64PerfectShuffle.h

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AArch64/arm64-dup.ll
Original file line number Diff line number Diff line change
Expand Up @@ -446,10 +446,9 @@ define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float>
define void @disguised_dup(<4 x float> %x, <4 x float>* %p1, <4 x float>* %p2) {
; CHECK-LABEL: disguised_dup:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1.4s v1, v0, v0
; CHECK-NEXT: uzp2.4s v2, v0, v1
; CHECK-NEXT: ext.16b v1, v0, v0, #4
; CHECK-NEXT: mov.s v1[2], v0[0]
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: uzp1.4s v1, v2, v1
; CHECK-NEXT: str q1, [x0]
; CHECK-NEXT: str q0, [x1]
; CHECK-NEXT: ret
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AArch64/arm64-rev.ll
Original file line number Diff line number Diff line change
Expand Up @@ -559,10 +559,9 @@ define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest
; CHECK-LABEL: float_vrev64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: ldr q1, [x0]
; CHECK-NEXT: add x8, x0, #12
; CHECK-NEXT: dup.4s v0, v0[0]
; CHECK-NEXT: trn2.4s v1, v1, v0
; CHECK-NEXT: ext.16b v0, v1, v0, #4
; CHECK-NEXT: ld1.s { v0 }[1], [x8]
; CHECK-NEXT: str q0, [x1, #176]
; CHECK-NEXT: ret
;
Expand Down
171 changes: 84 additions & 87 deletions llvm/test/CodeGen/AArch64/insert-extend.ll
Original file line number Diff line number Diff line change
Expand Up @@ -104,106 +104,103 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
; CHECK-NEXT: rev64 v5.4s, v2.4s
; CHECK-NEXT: add v16.4s, v0.4s, v7.4s
; CHECK-NEXT: add v17.4s, v3.4s, v6.4s
; CHECK-NEXT: add v22.4s, v1.4s, v4.4s
; CHECK-NEXT: uzp2 v18.4s, v17.4s, v16.4s
; CHECK-NEXT: uzp2 v19.4s, v16.4s, v17.4s
; CHECK-NEXT: add v21.4s, v2.4s, v5.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
; CHECK-NEXT: trn2 v20.4s, v17.4s, v16.4s
; CHECK-NEXT: uzp2 v7.4s, v17.4s, v16.4s
; CHECK-NEXT: zip2 v18.4s, v0.4s, v3.4s
; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
; CHECK-NEXT: uzp2 v3.4s, v16.4s, v17.4s
; CHECK-NEXT: add v20.4s, v2.4s, v5.4s
; CHECK-NEXT: add v21.4s, v1.4s, v4.4s
; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
; CHECK-NEXT: zip1 v4.4s, v22.4s, v21.4s
; CHECK-NEXT: uzp2 v17.4s, v18.4s, v17.4s
; CHECK-NEXT: zip2 v18.4s, v22.4s, v21.4s
; CHECK-NEXT: uzp2 v16.4s, v19.4s, v16.4s
; CHECK-NEXT: zip1 v5.4s, v1.4s, v2.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
; CHECK-NEXT: trn2 v19.4s, v17.4s, v16.4s
; CHECK-NEXT: zip1 v4.4s, v21.4s, v20.4s
; CHECK-NEXT: uzp2 v5.4s, v7.4s, v17.4s
; CHECK-NEXT: zip2 v7.4s, v21.4s, v20.4s
; CHECK-NEXT: zip1 v17.4s, v1.4s, v2.4s
; CHECK-NEXT: uzp2 v3.4s, v3.4s, v16.4s
; CHECK-NEXT: mov v6.d[1], v4.d[1]
; CHECK-NEXT: mov v16.d[1], v18.d[1]
; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s
; CHECK-NEXT: ext v5.16b, v1.16b, v5.16b, #8
; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
; CHECK-NEXT: mov v5.d[1], v7.d[1]
; CHECK-NEXT: ext v16.16b, v1.16b, v17.16b, #8
; CHECK-NEXT: mov v3.d[1], v7.d[1]
; CHECK-NEXT: mov v19.d[1], v4.d[1]
; CHECK-NEXT: mov v1.s[3], v2.s[2]
; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s
; CHECK-NEXT: mov v17.d[1], v18.d[1]
; CHECK-NEXT: mov v20.d[1], v4.d[1]
; CHECK-NEXT: rev64 v6.4s, v3.4s
; CHECK-NEXT: mov v0.d[1], v5.d[1]
; CHECK-NEXT: mov v7.d[1], v1.d[1]
; CHECK-NEXT: add v2.4s, v17.4s, v20.4s
; CHECK-NEXT: add v5.4s, v3.4s, v6.4s
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
; CHECK-NEXT: sub v6.4s, v0.4s, v7.4s
; CHECK-NEXT: add v0.4s, v7.4s, v0.4s
; CHECK-NEXT: mov v0.d[1], v16.d[1]
; CHECK-NEXT: sub v2.4s, v6.4s, v3.4s
; CHECK-NEXT: add v3.4s, v5.4s, v19.4s
; CHECK-NEXT: mov v18.d[1], v1.d[1]
; CHECK-NEXT: rev64 v5.4s, v3.4s
; CHECK-NEXT: rev64 v4.4s, v2.4s
; CHECK-NEXT: rev64 v7.4s, v6.4s
; CHECK-NEXT: rev64 v16.4s, v0.4s
; CHECK-NEXT: sub v7.4s, v0.4s, v18.4s
; CHECK-NEXT: add v0.4s, v18.4s, v0.4s
; CHECK-NEXT: add v6.4s, v3.4s, v5.4s
; CHECK-NEXT: rev64 v16.4s, v7.4s
; CHECK-NEXT: rev64 v17.4s, v0.4s
; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
; CHECK-NEXT: rev64 v5.4s, v6.4s
; CHECK-NEXT: add v1.4s, v2.4s, v4.4s
; CHECK-NEXT: add v18.4s, v7.4s, v16.4s
; CHECK-NEXT: add v19.4s, v0.4s, v17.4s
; CHECK-NEXT: sub v7.4s, v7.4s, v16.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v17.4s
; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s
; CHECK-NEXT: ext v4.16b, v3.16b, v5.16b, #12
; CHECK-NEXT: add v5.4s, v6.4s, v7.4s
; CHECK-NEXT: add v17.4s, v0.4s, v16.4s
; CHECK-NEXT: sub v0.4s, v0.4s, v16.4s
; CHECK-NEXT: sub v6.4s, v6.4s, v7.4s
; CHECK-NEXT: ext v7.16b, v0.16b, v17.16b, #12
; CHECK-NEXT: ext v5.16b, v6.16b, v5.16b, #12
; CHECK-NEXT: rev64 v22.4s, v1.4s
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4
; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #8
; CHECK-NEXT: ext v18.16b, v7.16b, v0.16b, #4
; CHECK-NEXT: ext v19.16b, v7.16b, v7.16b, #8
; CHECK-NEXT: ext v20.16b, v5.16b, v6.16b, #4
; CHECK-NEXT: ext v21.16b, v5.16b, v5.16b, #8
; CHECK-NEXT: trn2 v4.4s, v5.4s, v3.4s
; CHECK-NEXT: ext v5.16b, v2.16b, v1.16b, #12
; CHECK-NEXT: ext v16.16b, v0.16b, v19.16b, #12
; CHECK-NEXT: ext v17.16b, v7.16b, v18.16b, #12
; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4
; CHECK-NEXT: rev64 v5.4s, v5.4s
; CHECK-NEXT: rev64 v7.4s, v7.4s
; CHECK-NEXT: rev64 v4.4s, v4.4s
; CHECK-NEXT: trn2 v1.4s, v2.4s, v1.4s
; CHECK-NEXT: ext v16.16b, v16.16b, v17.16b, #12
; CHECK-NEXT: ext v17.16b, v18.16b, v19.16b, #12
; CHECK-NEXT: ext v18.16b, v20.16b, v21.16b, #12
; CHECK-NEXT: trn2 v19.4s, v22.4s, v2.4s
; CHECK-NEXT: ext v2.16b, v5.16b, v6.16b, #4
; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #4
; CHECK-NEXT: ext v3.16b, v4.16b, v3.16b, #4
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
; CHECK-NEXT: add v4.4s, v18.4s, v2.4s
; CHECK-NEXT: add v5.4s, v17.4s, v0.4s
; CHECK-NEXT: add v6.4s, v16.4s, v3.4s
; CHECK-NEXT: add v7.4s, v19.4s, v1.4s
; CHECK-NEXT: sub v2.4s, v18.4s, v2.4s
; CHECK-NEXT: sub v0.4s, v17.4s, v0.4s
; CHECK-NEXT: sub v1.4s, v19.4s, v1.4s
; CHECK-NEXT: sub v3.4s, v16.4s, v3.4s
; CHECK-NEXT: mov v7.d[1], v1.d[1]
; CHECK-NEXT: mov v6.d[1], v3.d[1]
; CHECK-NEXT: mov v4.d[1], v2.d[1]
; CHECK-NEXT: mov v5.d[1], v0.d[1]
; CHECK-NEXT: rev64 v16.4s, v16.4s
; CHECK-NEXT: rev64 v17.4s, v17.4s
; CHECK-NEXT: mov v1.s[3], v2.s[3]
; CHECK-NEXT: mov v19.s[3], v0.s[3]
; CHECK-NEXT: mov v18.s[3], v7.s[3]
; CHECK-NEXT: ext v16.16b, v16.16b, v0.16b, #4
; CHECK-NEXT: ext v17.16b, v17.16b, v7.16b, #4
; CHECK-NEXT: ext v5.16b, v5.16b, v2.16b, #4
; CHECK-NEXT: trn2 v3.4s, v3.4s, v6.4s
; CHECK-NEXT: sub v20.4s, v19.4s, v16.4s
; CHECK-NEXT: sub v21.4s, v18.4s, v17.4s
; CHECK-NEXT: sub v6.4s, v1.4s, v5.4s
; CHECK-NEXT: mov v18.s[0], v7.s[0]
; CHECK-NEXT: mov v19.s[0], v0.s[0]
; CHECK-NEXT: ext v0.16b, v3.16b, v3.16b, #4
; CHECK-NEXT: mov v1.s[0], v2.s[0]
; CHECK-NEXT: add v2.4s, v18.4s, v17.4s
; CHECK-NEXT: add v3.4s, v19.4s, v16.4s
; CHECK-NEXT: add v7.4s, v4.4s, v0.4s
; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s
; CHECK-NEXT: add v1.4s, v1.4s, v5.4s
; CHECK-NEXT: mov v7.d[1], v0.d[1]
; CHECK-NEXT: mov v1.d[1], v6.d[1]
; CHECK-NEXT: mov v2.d[1], v21.d[1]
; CHECK-NEXT: mov v3.d[1], v20.d[1]
; CHECK-NEXT: movi v0.8h, #1
; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff
; CHECK-NEXT: ushr v1.4s, v4.4s, #15
; CHECK-NEXT: ushr v2.4s, v7.4s, #15
; CHECK-NEXT: ushr v3.4s, v5.4s, #15
; CHECK-NEXT: ushr v16.4s, v6.4s, #15
; CHECK-NEXT: and v2.16b, v2.16b, v0.16b
; CHECK-NEXT: ushr v4.4s, v2.4s, #15
; CHECK-NEXT: ushr v5.4s, v7.4s, #15
; CHECK-NEXT: ushr v6.4s, v3.4s, #15
; CHECK-NEXT: ushr v16.4s, v1.4s, #15
; CHECK-NEXT: and v5.16b, v5.16b, v0.16b
; CHECK-NEXT: and v16.16b, v16.16b, v0.16b
; CHECK-NEXT: and v3.16b, v3.16b, v0.16b
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
; CHECK-NEXT: mul v1.4s, v2.4s, v17.4s
; CHECK-NEXT: mul v2.4s, v16.4s, v17.4s
; CHECK-NEXT: and v6.16b, v6.16b, v0.16b
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
; CHECK-NEXT: mul v4.4s, v5.4s, v17.4s
; CHECK-NEXT: mul v5.4s, v16.4s, v17.4s
; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s
; CHECK-NEXT: mul v3.4s, v3.4s, v17.4s
; CHECK-NEXT: add v7.4s, v1.4s, v7.4s
; CHECK-NEXT: add v6.4s, v2.4s, v6.4s
; CHECK-NEXT: add v4.4s, v0.4s, v4.4s
; CHECK-NEXT: add v5.4s, v3.4s, v5.4s
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b
; CHECK-NEXT: eor v2.16b, v6.16b, v2.16b
; CHECK-NEXT: eor v1.16b, v7.16b, v1.16b
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-NEXT: mul v6.4s, v6.4s, v17.4s
; CHECK-NEXT: add v7.4s, v4.4s, v7.4s
; CHECK-NEXT: add v1.4s, v5.4s, v1.4s
; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-NEXT: add v3.4s, v6.4s, v3.4s
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
; CHECK-NEXT: eor v2.16b, v3.16b, v6.16b
; CHECK-NEXT: eor v1.16b, v1.16b, v5.16b
; CHECK-NEXT: eor v3.16b, v7.16b, v4.16b
; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
Expand Down
13 changes: 7 additions & 6 deletions llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -939,9 +939,11 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) {
define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: vselect_equivalent_shuffle_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #2
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #2
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.h[2], v1.h[1]
; CHECK-NEXT: mov v0.h[1], v1.h[0]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
ret <4 x i16> %c
Expand All @@ -950,9 +952,8 @@ define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
define <4 x i32> @vselect_equivalent_shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: vselect_equivalent_shuffle_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: mov v0.s[2], v1.s[1]
; CHECK-NEXT: ret
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
ret <4 x i32> %c
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AArch64/neon-wide-splat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,8 @@ define <8 x i8> @shuffle_not1(<16 x i8> %v) {
define <4 x i32> @shuffle_not2(<4 x i32> %v) {
; CHECK-LABEL: shuffle_not2:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: trn1 v1.4s, v0.4s, v0.4s
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
; CHECK-NEXT: mov v0.s[3], v0.s[2]
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: ret
entry:
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 2>
Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
Original file line number Diff line number Diff line change
Expand Up @@ -517,9 +517,8 @@ define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; CHECK-LABEL: shuffle3_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
; CHECK-NEXT: mov v0.s[1], v1.s[0]
; CHECK-NEXT: dup v1.4s, v2.s[0]
; CHECK-NEXT: mov v0.s[2], v1.s[2]
; CHECK-NEXT: ret
Expand Down
Loading

0 comments on commit 73dc996

Please sign in to comment.