[AArch64] Add lane moves to PerfectShuffle tables

davemgreen · davemgreen · commit 73dc996428ae · 2022-04-19T14:49:50.000+01:00
This teaches the perfect shuffle tables about lane inserts, which can help reduce the cost of many entries. Many of the shuffle masks are only one lane away from being correct, and a single lane move can be a lot simpler than trying to use ext/zip/etc. Because these lane moves are not exactly like the other operations handled in the perfect shuffle tables, they require special casing to generate, using a special InsOp operator (OP_MOVLANE in the lowering code). The lane to insert into is encoded as the RHSID, and the lane to move from is taken from the original shuffle mask. This helps reduce the maximum perfect shuffle entry cost to 3, with many more shuffles being generatable in a single instruction. Differential Revision: https://reviews.llvm.org/D123386
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9745,8 +9745,12 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
 }
 
 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
-/// the specified operations to build the shuffle.
-static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+/// the specified operations to build the shuffle. ID is the perfect-shuffle
+/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the
+/// perfect-shuffle table entry and LHS/RHS are the immediate inputs for this
+/// stage of the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
+                                      SDValue V2, unsigned PFEntry, SDValue LHS,
                                       SDValue RHS, SelectionDAG &DAG,
                                       const SDLoc &dl) {
   unsigned OpNum = (PFEntry >> 26) & 0x0F;
@@ -9763,12 +9767,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
     OP_VEXT1,
     OP_VEXT2,
     OP_VEXT3,
-    OP_VUZPL, // VUZP, left result
-    OP_VUZPR, // VUZP, right result
-    OP_VZIPL, // VZIP, left result
-    OP_VZIPR, // VZIP, right result
-    OP_VTRNL, // VTRN, left result
-    OP_VTRNR  // VTRN, right result
+    OP_VUZPL,  // VUZP, left result
+    OP_VUZPR,  // VUZP, right result
+    OP_VZIPL,  // VZIP, left result
+    OP_VZIPR,  // VZIP, right result
+    OP_VTRNL,  // VTRN, left result
+    OP_VTRNR,  // VTRN, right result
+    OP_MOVLANE // Move lane. RHSID is the lane to move into
   };
 
   if (OpNum == OP_COPY) {
@@ -9778,9 +9783,48 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
     return RHS;
   }
 
+  if (OpNum == OP_MOVLANE) {
+    // Decompose a PerfectShuffle ID to get the Mask for lane Elt
+    auto getPFIDLane = [](unsigned ID, int Elt) -> int {
+      assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
+      Elt = 3 - Elt;
+      while (Elt > 0) {
+        ID /= 9;
+        Elt--;
+      }
+      return (ID % 9 == 8) ? -1 : ID % 9;
+    };
+
+    // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
+    // get the lane to move from from the PFID, which is always from the
+    // original vectors (V1 or V2).
+    SDValue OpLHS = GeneratePerfectShuffle(
+        LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+    EVT VT = OpLHS.getValueType();
+    assert(RHSID < 8 && "Expected a lane index for RHSID!");
+    int MaskElt = getPFIDLane(ID, RHSID);
+    assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+    unsigned ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+    SDValue Input = MaskElt < 4 ? V1 : V2;
+    // Be careful about creating illegal types. Use f16 instead of i16.
+    if (VT == MVT::v4i16) {
+      Input = DAG.getBitcast(MVT::v4f16, Input);
+      OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
+    }
+    SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              Input.getValueType().getVectorElementType(),
+                              Input, DAG.getVectorIdxConstant(ExtLane, dl));
+    SDValue Ins =
+        DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
+                    Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
+    return DAG.getBitcast(VT, Ins);
+  }
+
   SDValue OpLHS, OpRHS;
-  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
-  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
+                                 RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
+                                 RHS, DAG, dl);
   EVT VT = OpLHS.getValueType();
 
   switch (OpNum) {
@@ -10239,7 +10283,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                             PFIndexes[2] * 9 + PFIndexes[3];
     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
-    return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+    return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
+                                  dl);
   }
 
   return GenerateTBL(Op, ShuffleMask, DAG);
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -446,10 +446,9 @@ define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float>
 define void @disguised_dup(<4 x float> %x, <4 x float>* %p1, <4 x float>* %p2) {
 ; CHECK-LABEL: disguised_dup:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1.4s v1, v0, v0
-; CHECK-NEXT:    uzp2.4s v2, v0, v1
+; CHECK-NEXT:    ext.16b v1, v0, v0, #4
+; CHECK-NEXT:    mov.s v1[2], v0[0]
 ; CHECK-NEXT:    dup.4s v0, v0[0]
-; CHECK-NEXT:    uzp1.4s v1, v2, v1
 ; CHECK-NEXT:    str q1, [x0]
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -559,10 +559,9 @@ define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest
 ; CHECK-LABEL: float_vrev64:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
-; CHECK-NEXT:    ldr q1, [x0]
+; CHECK-NEXT:    add x8, x0, #12
 ; CHECK-NEXT:    dup.4s v0, v0[0]
-; CHECK-NEXT:    trn2.4s v1, v1, v0
-; CHECK-NEXT:    ext.16b v0, v1, v0, #4
+; CHECK-NEXT:    ld1.s { v0 }[1], [x8]
 ; CHECK-NEXT:    str q0, [x1, #176]
 ; CHECK-NEXT:    ret
 ;
diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll
@@ -104,106 +104,103 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
 ; CHECK-NEXT:    rev64 v5.4s, v2.4s
 ; CHECK-NEXT:    add v16.4s, v0.4s, v7.4s
 ; CHECK-NEXT:    add v17.4s, v3.4s, v6.4s
-; CHECK-NEXT:    add v22.4s, v1.4s, v4.4s
-; CHECK-NEXT:    uzp2 v18.4s, v17.4s, v16.4s
-; CHECK-NEXT:    uzp2 v19.4s, v16.4s, v17.4s
-; CHECK-NEXT:    add v21.4s, v2.4s, v5.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
 ; CHECK-NEXT:    sub v3.4s, v3.4s, v6.4s
-; CHECK-NEXT:    trn2 v6.4s, v16.4s, v17.4s
-; CHECK-NEXT:    trn2 v20.4s, v17.4s, v16.4s
+; CHECK-NEXT:    uzp2 v7.4s, v17.4s, v16.4s
+; CHECK-NEXT:    zip2 v18.4s, v0.4s, v3.4s
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    uzp2 v3.4s, v16.4s, v17.4s
+; CHECK-NEXT:    add v20.4s, v2.4s, v5.4s
+; CHECK-NEXT:    add v21.4s, v1.4s, v4.4s
 ; CHECK-NEXT:    sub v2.4s, v2.4s, v5.4s
 ; CHECK-NEXT:    sub v1.4s, v1.4s, v4.4s
-; CHECK-NEXT:    zip1 v4.4s, v22.4s, v21.4s
-; CHECK-NEXT:    uzp2 v17.4s, v18.4s, v17.4s
-; CHECK-NEXT:    zip2 v18.4s, v22.4s, v21.4s
-; CHECK-NEXT:    uzp2 v16.4s, v19.4s, v16.4s
-; CHECK-NEXT:    zip1 v5.4s, v1.4s, v2.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v7.4s
+; CHECK-NEXT:    trn2 v6.4s, v16.4s, v17.4s
+; CHECK-NEXT:    trn2 v19.4s, v17.4s, v16.4s
+; CHECK-NEXT:    zip1 v4.4s, v21.4s, v20.4s
+; CHECK-NEXT:    uzp2 v5.4s, v7.4s, v17.4s
+; CHECK-NEXT:    zip2 v7.4s, v21.4s, v20.4s
+; CHECK-NEXT:    zip1 v17.4s, v1.4s, v2.4s
+; CHECK-NEXT:    uzp2 v3.4s, v3.4s, v16.4s
 ; CHECK-NEXT:    mov v6.d[1], v4.d[1]
-; CHECK-NEXT:    mov v16.d[1], v18.d[1]
-; CHECK-NEXT:    zip2 v7.4s, v0.4s, v3.4s
-; CHECK-NEXT:    ext v5.16b, v1.16b, v5.16b, #8
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
+; CHECK-NEXT:    mov v5.d[1], v7.d[1]
+; CHECK-NEXT:    ext v16.16b, v1.16b, v17.16b, #8
+; CHECK-NEXT:    mov v3.d[1], v7.d[1]
+; CHECK-NEXT:    mov v19.d[1], v4.d[1]
 ; CHECK-NEXT:    mov v1.s[3], v2.s[2]
-; CHECK-NEXT:    sub v3.4s, v6.4s, v16.4s
-; CHECK-NEXT:    mov v17.d[1], v18.d[1]
-; CHECK-NEXT:    mov v20.d[1], v4.d[1]
-; CHECK-NEXT:    rev64 v6.4s, v3.4s
-; CHECK-NEXT:    mov v0.d[1], v5.d[1]
-; CHECK-NEXT:    mov v7.d[1], v1.d[1]
-; CHECK-NEXT:    add v2.4s, v17.4s, v20.4s
-; CHECK-NEXT:    add v5.4s, v3.4s, v6.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v6.4s
-; CHECK-NEXT:    sub v6.4s, v0.4s, v7.4s
-; CHECK-NEXT:    add v0.4s, v7.4s, v0.4s
+; CHECK-NEXT:    mov v0.d[1], v16.d[1]
+; CHECK-NEXT:    sub v2.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v5.4s, v19.4s
+; CHECK-NEXT:    mov v18.d[1], v1.d[1]
+; CHECK-NEXT:    rev64 v5.4s, v3.4s
 ; CHECK-NEXT:    rev64 v4.4s, v2.4s
-; CHECK-NEXT:    rev64 v7.4s, v6.4s
-; CHECK-NEXT:    rev64 v16.4s, v0.4s
+; CHECK-NEXT:    sub v7.4s, v0.4s, v18.4s
+; CHECK-NEXT:    add v0.4s, v18.4s, v0.4s
+; CHECK-NEXT:    add v6.4s, v3.4s, v5.4s
+; CHECK-NEXT:    rev64 v16.4s, v7.4s
+; CHECK-NEXT:    rev64 v17.4s, v0.4s
+; CHECK-NEXT:    sub v3.4s, v3.4s, v5.4s
+; CHECK-NEXT:    rev64 v5.4s, v6.4s
 ; CHECK-NEXT:    add v1.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v18.4s, v7.4s, v16.4s
+; CHECK-NEXT:    add v19.4s, v0.4s, v17.4s
+; CHECK-NEXT:    sub v7.4s, v7.4s, v16.4s
+; CHECK-NEXT:    sub v0.4s, v0.4s, v17.4s
 ; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
-; CHECK-NEXT:    ext v4.16b, v3.16b, v5.16b, #12
-; CHECK-NEXT:    add v5.4s, v6.4s, v7.4s
-; CHECK-NEXT:    add v17.4s, v0.4s, v16.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v16.4s
-; CHECK-NEXT:    sub v6.4s, v6.4s, v7.4s
-; CHECK-NEXT:    ext v7.16b, v0.16b, v17.16b, #12
-; CHECK-NEXT:    ext v5.16b, v6.16b, v5.16b, #12
-; CHECK-NEXT:    rev64 v22.4s, v1.4s
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    ext v16.16b, v4.16b, v3.16b, #4
-; CHECK-NEXT:    ext v17.16b, v4.16b, v4.16b, #8
-; CHECK-NEXT:    ext v18.16b, v7.16b, v0.16b, #4
-; CHECK-NEXT:    ext v19.16b, v7.16b, v7.16b, #8
-; CHECK-NEXT:    ext v20.16b, v5.16b, v6.16b, #4
-; CHECK-NEXT:    ext v21.16b, v5.16b, v5.16b, #8
+; CHECK-NEXT:    trn2 v4.4s, v5.4s, v3.4s
+; CHECK-NEXT:    ext v5.16b, v2.16b, v1.16b, #12
+; CHECK-NEXT:    ext v16.16b, v0.16b, v19.16b, #12
+; CHECK-NEXT:    ext v17.16b, v7.16b, v18.16b, #12
+; CHECK-NEXT:    ext v6.16b, v6.16b, v6.16b, #4
 ; CHECK-NEXT:    rev64 v5.4s, v5.4s
-; CHECK-NEXT:    rev64 v7.4s, v7.4s
-; CHECK-NEXT:    rev64 v4.4s, v4.4s
-; CHECK-NEXT:    trn2 v1.4s, v2.4s, v1.4s
-; CHECK-NEXT:    ext v16.16b, v16.16b, v17.16b, #12
-; CHECK-NEXT:    ext v17.16b, v18.16b, v19.16b, #12
-; CHECK-NEXT:    ext v18.16b, v20.16b, v21.16b, #12
-; CHECK-NEXT:    trn2 v19.4s, v22.4s, v2.4s
-; CHECK-NEXT:    ext v2.16b, v5.16b, v6.16b, #4
-; CHECK-NEXT:    ext v0.16b, v7.16b, v0.16b, #4
-; CHECK-NEXT:    ext v3.16b, v4.16b, v3.16b, #4
-; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    add v4.4s, v18.4s, v2.4s
-; CHECK-NEXT:    add v5.4s, v17.4s, v0.4s
-; CHECK-NEXT:    add v6.4s, v16.4s, v3.4s
-; CHECK-NEXT:    add v7.4s, v19.4s, v1.4s
-; CHECK-NEXT:    sub v2.4s, v18.4s, v2.4s
-; CHECK-NEXT:    sub v0.4s, v17.4s, v0.4s
-; CHECK-NEXT:    sub v1.4s, v19.4s, v1.4s
-; CHECK-NEXT:    sub v3.4s, v16.4s, v3.4s
-; CHECK-NEXT:    mov v7.d[1], v1.d[1]
-; CHECK-NEXT:    mov v6.d[1], v3.d[1]
-; CHECK-NEXT:    mov v4.d[1], v2.d[1]
-; CHECK-NEXT:    mov v5.d[1], v0.d[1]
+; CHECK-NEXT:    rev64 v16.4s, v16.4s
+; CHECK-NEXT:    rev64 v17.4s, v17.4s
+; CHECK-NEXT:    mov v1.s[3], v2.s[3]
+; CHECK-NEXT:    mov v19.s[3], v0.s[3]
+; CHECK-NEXT:    mov v18.s[3], v7.s[3]
+; CHECK-NEXT:    ext v16.16b, v16.16b, v0.16b, #4
+; CHECK-NEXT:    ext v17.16b, v17.16b, v7.16b, #4
+; CHECK-NEXT:    ext v5.16b, v5.16b, v2.16b, #4
+; CHECK-NEXT:    trn2 v3.4s, v3.4s, v6.4s
+; CHECK-NEXT:    sub v20.4s, v19.4s, v16.4s
+; CHECK-NEXT:    sub v21.4s, v18.4s, v17.4s
+; CHECK-NEXT:    sub v6.4s, v1.4s, v5.4s
+; CHECK-NEXT:    mov v18.s[0], v7.s[0]
+; CHECK-NEXT:    mov v19.s[0], v0.s[0]
+; CHECK-NEXT:    ext v0.16b, v3.16b, v3.16b, #4
+; CHECK-NEXT:    mov v1.s[0], v2.s[0]
+; CHECK-NEXT:    add v2.4s, v18.4s, v17.4s
+; CHECK-NEXT:    add v3.4s, v19.4s, v16.4s
+; CHECK-NEXT:    add v7.4s, v4.4s, v0.4s
+; CHECK-NEXT:    sub v0.4s, v4.4s, v0.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    mov v7.d[1], v0.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
+; CHECK-NEXT:    mov v2.d[1], v21.d[1]
+; CHECK-NEXT:    mov v3.d[1], v20.d[1]
 ; CHECK-NEXT:    movi v0.8h, #1
 ; CHECK-NEXT:    movi v17.2d, #0x00ffff0000ffff
-; CHECK-NEXT:    ushr v1.4s, v4.4s, #15
-; CHECK-NEXT:    ushr v2.4s, v7.4s, #15
-; CHECK-NEXT:    ushr v3.4s, v5.4s, #15
-; CHECK-NEXT:    ushr v16.4s, v6.4s, #15
-; CHECK-NEXT:    and v2.16b, v2.16b, v0.16b
+; CHECK-NEXT:    ushr v4.4s, v2.4s, #15
+; CHECK-NEXT:    ushr v5.4s, v7.4s, #15
+; CHECK-NEXT:    ushr v6.4s, v3.4s, #15
+; CHECK-NEXT:    ushr v16.4s, v1.4s, #15
+; CHECK-NEXT:    and v5.16b, v5.16b, v0.16b
 ; CHECK-NEXT:    and v16.16b, v16.16b, v0.16b
-; CHECK-NEXT:    and v3.16b, v3.16b, v0.16b
-; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    mul v1.4s, v2.4s, v17.4s
-; CHECK-NEXT:    mul v2.4s, v16.4s, v17.4s
+; CHECK-NEXT:    and v6.16b, v6.16b, v0.16b
+; CHECK-NEXT:    and v0.16b, v4.16b, v0.16b
+; CHECK-NEXT:    mul v4.4s, v5.4s, v17.4s
+; CHECK-NEXT:    mul v5.4s, v16.4s, v17.4s
 ; CHECK-NEXT:    mul v0.4s, v0.4s, v17.4s
-; CHECK-NEXT:    mul v3.4s, v3.4s, v17.4s
-; CHECK-NEXT:    add v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    add v6.4s, v2.4s, v6.4s
-; CHECK-NEXT:    add v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    add v5.4s, v3.4s, v5.4s
-; CHECK-NEXT:    eor v0.16b, v4.16b, v0.16b
-; CHECK-NEXT:    eor v3.16b, v5.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v6.16b, v2.16b
-; CHECK-NEXT:    eor v1.16b, v7.16b, v1.16b
-; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
-; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
+; CHECK-NEXT:    mul v6.4s, v6.4s, v17.4s
+; CHECK-NEXT:    add v7.4s, v4.4s, v7.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-NEXT:    add v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    eor v0.16b, v2.16b, v0.16b
+; CHECK-NEXT:    eor v2.16b, v3.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v1.16b, v5.16b
+; CHECK-NEXT:    eor v3.16b, v7.16b, v4.16b
+; CHECK-NEXT:    add v1.4s, v3.4s, v1.4s
+; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
 ; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    addv s0, v0.4s
 ; CHECK-NEXT:    fmov w8, s0
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -939,9 +939,11 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) {
 define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-LABEL: vselect_equivalent_shuffle_v4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #2
-; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
-; CHECK-NEXT:    ext v0.8b, v0.8b, v0.8b, #2
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.h[2], v1.h[1]
+; CHECK-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
   ret <4 x i16> %c
@@ -950,9 +952,8 @@ define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
 define <4 x i32> @vselect_equivalent_shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: vselect_equivalent_shuffle_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
-; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
-; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #4
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
+; CHECK-NEXT:    mov v0.s[2], v1.s[1]
 ; CHECK-NEXT:    ret
   %c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
   ret <4 x i32> %c
diff --git a/llvm/test/CodeGen/AArch64/neon-wide-splat.ll b/llvm/test/CodeGen/AArch64/neon-wide-splat.ll
@@ -109,9 +109,8 @@ define <8 x i8> @shuffle_not1(<16 x i8> %v) {
 define <4 x i32> @shuffle_not2(<4 x i32> %v) {
 ; CHECK-LABEL: shuffle_not2:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    trn1 v1.4s, v0.4s, v0.4s
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v0.4s
-; CHECK-NEXT:    zip2 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v0.s[3], v0.s[2]
+; CHECK-NEXT:    uzp2 v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    ret
 entry:
   %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@@ -517,9 +517,8 @@ define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
 define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; CHECK-LABEL: shuffle3_v4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    uzp1 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    ext v0.16b, v1.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v0.4s
+; CHECK-NEXT:    mov v0.s[1], v1.s[0]
 ; CHECK-NEXT:    dup v1.4s, v2.s[0]
 ; CHECK-NEXT:    mov v0.s[2], v1.s[2]
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/shuffles.ll b/llvm/test/CodeGen/AArch64/shuffles.ll
diff --git a/llvm/utils/PerfectShuffle/PerfectShuffle.cpp b/llvm/utils/PerfectShuffle/PerfectShuffle.cpp