Skip to content

Commit 73dc996

Browse files
committed
[AArch64] Add lane moves to PerfectShuffle tables
This teaches the perfect shuffle tables about lane inserts, which can help reduce the cost of many entries. Many of the shuffle masks are one lane away from being correct, and a simple lane move can be a lot simpler than trying to use ext/zip/etc. Because they are not exactly like the other masks handled in the perfect shuffle tables, they require special casing to generate, with a special InsOp operator. The lane to insert into is encoded as the RHSID, and the lane to move from is taken from the original mask. This helps reduce the maximum perfect shuffle entry cost to 3, with many more shuffles being generatable in a single instruction. Differential Revision: https://reviews.llvm.org/D123386
1 parent 7adfa31 commit 73dc996

File tree

10 files changed

+2741
-2651
lines changed

10 files changed

+2741
-2651
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 56 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9745,8 +9745,12 @@ static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
97459745
}
97469746

97479747
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
9748-
/// the specified operations to build the shuffle.
9749-
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
9748+
/// the specified operations to build the shuffle. ID is the perfect-shuffle
9749+
/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect
9750+
/// shuffle table entry and LHS/RHS are the immediate inputs for this stage of
9751+
/// the shuffle.
9752+
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
9753+
SDValue V2, unsigned PFEntry, SDValue LHS,
97509754
SDValue RHS, SelectionDAG &DAG,
97519755
const SDLoc &dl) {
97529756
unsigned OpNum = (PFEntry >> 26) & 0x0F;
@@ -9763,12 +9767,13 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
97639767
OP_VEXT1,
97649768
OP_VEXT2,
97659769
OP_VEXT3,
9766-
OP_VUZPL, // VUZP, left result
9767-
OP_VUZPR, // VUZP, right result
9768-
OP_VZIPL, // VZIP, left result
9769-
OP_VZIPR, // VZIP, right result
9770-
OP_VTRNL, // VTRN, left result
9771-
OP_VTRNR // VTRN, right result
9770+
OP_VUZPL, // VUZP, left result
9771+
OP_VUZPR, // VUZP, right result
9772+
OP_VZIPL, // VZIP, left result
9773+
OP_VZIPR, // VZIP, right result
9774+
OP_VTRNL, // VTRN, left result
9775+
OP_VTRNR, // VTRN, right result
9776+
OP_MOVLANE // Move lane. RHSID is the lane to move into
97729777
};
97739778

97749779
if (OpNum == OP_COPY) {
@@ -9778,9 +9783,48 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
97789783
return RHS;
97799784
}
97809785

9786+
if (OpNum == OP_MOVLANE) {
9787+
// Decompose a PerfectShuffle ID to get the Mask for lane Elt
9788+
auto getPFIDLane = [](unsigned ID, int Elt) -> int {
9789+
assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
9790+
Elt = 3 - Elt;
9791+
while (Elt > 0) {
9792+
ID /= 9;
9793+
Elt--;
9794+
}
9795+
return (ID % 9 == 8) ? -1 : ID % 9;
9796+
};
9797+
9798+
// For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
9799+
// get the lane to move from from the PFID, which is always from the
9800+
// original vectors (V1 or V2).
9801+
SDValue OpLHS = GeneratePerfectShuffle(
9802+
LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
9803+
EVT VT = OpLHS.getValueType();
9804+
assert(RHSID < 8 && "Expected a lane index for RHSID!");
9805+
int MaskElt = getPFIDLane(ID, RHSID);
9806+
assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
9807+
unsigned ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
9808+
SDValue Input = MaskElt < 4 ? V1 : V2;
9809+
// Be careful about creating illegal types. Use f16 instead of i16.
9810+
if (VT == MVT::v4i16) {
9811+
Input = DAG.getBitcast(MVT::v4f16, Input);
9812+
OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
9813+
}
9814+
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
9815+
Input.getValueType().getVectorElementType(),
9816+
Input, DAG.getVectorIdxConstant(ExtLane, dl));
9817+
SDValue Ins =
9818+
DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
9819+
Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
9820+
return DAG.getBitcast(VT, Ins);
9821+
}
9822+
97819823
SDValue OpLHS, OpRHS;
9782-
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
9783-
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
9824+
OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
9825+
RHS, DAG, dl);
9826+
OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
9827+
RHS, DAG, dl);
97849828
EVT VT = OpLHS.getValueType();
97859829

97869830
switch (OpNum) {
@@ -10239,7 +10283,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
1023910283
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
1024010284
PFIndexes[2] * 9 + PFIndexes[3];
1024110285
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10242-
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
10286+
return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
10287+
dl);
1024310288
}
1024410289

1024510290
return GenerateTBL(Op, ShuffleMask, DAG);

llvm/lib/Target/AArch64/AArch64PerfectShuffle.h

Lines changed: 2523 additions & 2523 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AArch64/arm64-dup.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -446,10 +446,9 @@ define <4 x float> @test_perfectshuffle_dupext_v4f32(<4 x float> %a, <4 x float>
446446
define void @disguised_dup(<4 x float> %x, <4 x float>* %p1, <4 x float>* %p2) {
447447
; CHECK-LABEL: disguised_dup:
448448
; CHECK: // %bb.0:
449-
; CHECK-NEXT: uzp1.4s v1, v0, v0
450-
; CHECK-NEXT: uzp2.4s v2, v0, v1
449+
; CHECK-NEXT: ext.16b v1, v0, v0, #4
450+
; CHECK-NEXT: mov.s v1[2], v0[0]
451451
; CHECK-NEXT: dup.4s v0, v0[0]
452-
; CHECK-NEXT: uzp1.4s v1, v2, v1
453452
; CHECK-NEXT: str q1, [x0]
454453
; CHECK-NEXT: str q0, [x1]
455454
; CHECK-NEXT: ret

llvm/test/CodeGen/AArch64/arm64-rev.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -559,10 +559,9 @@ define void @float_vrev64(float* nocapture %source, <4 x float>* nocapture %dest
559559
; CHECK-LABEL: float_vrev64:
560560
; CHECK: // %bb.0: // %entry
561561
; CHECK-NEXT: movi.2d v0, #0000000000000000
562-
; CHECK-NEXT: ldr q1, [x0]
562+
; CHECK-NEXT: add x8, x0, #12
563563
; CHECK-NEXT: dup.4s v0, v0[0]
564-
; CHECK-NEXT: trn2.4s v1, v1, v0
565-
; CHECK-NEXT: ext.16b v0, v1, v0, #4
564+
; CHECK-NEXT: ld1.s { v0 }[1], [x8]
566565
; CHECK-NEXT: str q0, [x1, #176]
567566
; CHECK-NEXT: ret
568567
;

llvm/test/CodeGen/AArch64/insert-extend.ll

Lines changed: 84 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -104,106 +104,103 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
104104
; CHECK-NEXT: rev64 v5.4s, v2.4s
105105
; CHECK-NEXT: add v16.4s, v0.4s, v7.4s
106106
; CHECK-NEXT: add v17.4s, v3.4s, v6.4s
107-
; CHECK-NEXT: add v22.4s, v1.4s, v4.4s
108-
; CHECK-NEXT: uzp2 v18.4s, v17.4s, v16.4s
109-
; CHECK-NEXT: uzp2 v19.4s, v16.4s, v17.4s
110-
; CHECK-NEXT: add v21.4s, v2.4s, v5.4s
107+
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
111108
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
112-
; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
113-
; CHECK-NEXT: trn2 v20.4s, v17.4s, v16.4s
109+
; CHECK-NEXT: uzp2 v7.4s, v17.4s, v16.4s
110+
; CHECK-NEXT: zip2 v18.4s, v0.4s, v3.4s
111+
; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
112+
; CHECK-NEXT: uzp2 v3.4s, v16.4s, v17.4s
113+
; CHECK-NEXT: add v20.4s, v2.4s, v5.4s
114+
; CHECK-NEXT: add v21.4s, v1.4s, v4.4s
114115
; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s
115116
; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
116-
; CHECK-NEXT: zip1 v4.4s, v22.4s, v21.4s
117-
; CHECK-NEXT: uzp2 v17.4s, v18.4s, v17.4s
118-
; CHECK-NEXT: zip2 v18.4s, v22.4s, v21.4s
119-
; CHECK-NEXT: uzp2 v16.4s, v19.4s, v16.4s
120-
; CHECK-NEXT: zip1 v5.4s, v1.4s, v2.4s
121-
; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
117+
; CHECK-NEXT: trn2 v6.4s, v16.4s, v17.4s
118+
; CHECK-NEXT: trn2 v19.4s, v17.4s, v16.4s
119+
; CHECK-NEXT: zip1 v4.4s, v21.4s, v20.4s
120+
; CHECK-NEXT: uzp2 v5.4s, v7.4s, v17.4s
121+
; CHECK-NEXT: zip2 v7.4s, v21.4s, v20.4s
122+
; CHECK-NEXT: zip1 v17.4s, v1.4s, v2.4s
123+
; CHECK-NEXT: uzp2 v3.4s, v3.4s, v16.4s
122124
; CHECK-NEXT: mov v6.d[1], v4.d[1]
123-
; CHECK-NEXT: mov v16.d[1], v18.d[1]
124-
; CHECK-NEXT: zip2 v7.4s, v0.4s, v3.4s
125-
; CHECK-NEXT: ext v5.16b, v1.16b, v5.16b, #8
126-
; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s
125+
; CHECK-NEXT: mov v5.d[1], v7.d[1]
126+
; CHECK-NEXT: ext v16.16b, v1.16b, v17.16b, #8
127+
; CHECK-NEXT: mov v3.d[1], v7.d[1]
128+
; CHECK-NEXT: mov v19.d[1], v4.d[1]
127129
; CHECK-NEXT: mov v1.s[3], v2.s[2]
128-
; CHECK-NEXT: sub v3.4s, v6.4s, v16.4s
129-
; CHECK-NEXT: mov v17.d[1], v18.d[1]
130-
; CHECK-NEXT: mov v20.d[1], v4.d[1]
131-
; CHECK-NEXT: rev64 v6.4s, v3.4s
132-
; CHECK-NEXT: mov v0.d[1], v5.d[1]
133-
; CHECK-NEXT: mov v7.d[1], v1.d[1]
134-
; CHECK-NEXT: add v2.4s, v17.4s, v20.4s
135-
; CHECK-NEXT: add v5.4s, v3.4s, v6.4s
136-
; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s
137-
; CHECK-NEXT: sub v6.4s, v0.4s, v7.4s
138-
; CHECK-NEXT: add v0.4s, v7.4s, v0.4s
130+
; CHECK-NEXT: mov v0.d[1], v16.d[1]
131+
; CHECK-NEXT: sub v2.4s, v6.4s, v3.4s
132+
; CHECK-NEXT: add v3.4s, v5.4s, v19.4s
133+
; CHECK-NEXT: mov v18.d[1], v1.d[1]
134+
; CHECK-NEXT: rev64 v5.4s, v3.4s
139135
; CHECK-NEXT: rev64 v4.4s, v2.4s
140-
; CHECK-NEXT: rev64 v7.4s, v6.4s
141-
; CHECK-NEXT: rev64 v16.4s, v0.4s
136+
; CHECK-NEXT: sub v7.4s, v0.4s, v18.4s
137+
; CHECK-NEXT: add v0.4s, v18.4s, v0.4s
138+
; CHECK-NEXT: add v6.4s, v3.4s, v5.4s
139+
; CHECK-NEXT: rev64 v16.4s, v7.4s
140+
; CHECK-NEXT: rev64 v17.4s, v0.4s
141+
; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
142+
; CHECK-NEXT: rev64 v5.4s, v6.4s
142143
; CHECK-NEXT: add v1.4s, v2.4s, v4.4s
144+
; CHECK-NEXT: add v18.4s, v7.4s, v16.4s
145+
; CHECK-NEXT: add v19.4s, v0.4s, v17.4s
146+
; CHECK-NEXT: sub v7.4s, v7.4s, v16.4s
147+
; CHECK-NEXT: sub v0.4s, v0.4s, v17.4s
143148
; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s
144-
; CHECK-NEXT: ext v4.16b, v3.16b, v5.16b, #12
145-
; CHECK-NEXT: add v5.4s, v6.4s, v7.4s
146-
; CHECK-NEXT: add v17.4s, v0.4s, v16.4s
147-
; CHECK-NEXT: sub v0.4s, v0.4s, v16.4s
148-
; CHECK-NEXT: sub v6.4s, v6.4s, v7.4s
149-
; CHECK-NEXT: ext v7.16b, v0.16b, v17.16b, #12
150-
; CHECK-NEXT: ext v5.16b, v6.16b, v5.16b, #12
151-
; CHECK-NEXT: rev64 v22.4s, v1.4s
152-
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
153-
; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4
154-
; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #8
155-
; CHECK-NEXT: ext v18.16b, v7.16b, v0.16b, #4
156-
; CHECK-NEXT: ext v19.16b, v7.16b, v7.16b, #8
157-
; CHECK-NEXT: ext v20.16b, v5.16b, v6.16b, #4
158-
; CHECK-NEXT: ext v21.16b, v5.16b, v5.16b, #8
149+
; CHECK-NEXT: trn2 v4.4s, v5.4s, v3.4s
150+
; CHECK-NEXT: ext v5.16b, v2.16b, v1.16b, #12
151+
; CHECK-NEXT: ext v16.16b, v0.16b, v19.16b, #12
152+
; CHECK-NEXT: ext v17.16b, v7.16b, v18.16b, #12
153+
; CHECK-NEXT: ext v6.16b, v6.16b, v6.16b, #4
159154
; CHECK-NEXT: rev64 v5.4s, v5.4s
160-
; CHECK-NEXT: rev64 v7.4s, v7.4s
161-
; CHECK-NEXT: rev64 v4.4s, v4.4s
162-
; CHECK-NEXT: trn2 v1.4s, v2.4s, v1.4s
163-
; CHECK-NEXT: ext v16.16b, v16.16b, v17.16b, #12
164-
; CHECK-NEXT: ext v17.16b, v18.16b, v19.16b, #12
165-
; CHECK-NEXT: ext v18.16b, v20.16b, v21.16b, #12
166-
; CHECK-NEXT: trn2 v19.4s, v22.4s, v2.4s
167-
; CHECK-NEXT: ext v2.16b, v5.16b, v6.16b, #4
168-
; CHECK-NEXT: ext v0.16b, v7.16b, v0.16b, #4
169-
; CHECK-NEXT: ext v3.16b, v4.16b, v3.16b, #4
170-
; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #4
171-
; CHECK-NEXT: add v4.4s, v18.4s, v2.4s
172-
; CHECK-NEXT: add v5.4s, v17.4s, v0.4s
173-
; CHECK-NEXT: add v6.4s, v16.4s, v3.4s
174-
; CHECK-NEXT: add v7.4s, v19.4s, v1.4s
175-
; CHECK-NEXT: sub v2.4s, v18.4s, v2.4s
176-
; CHECK-NEXT: sub v0.4s, v17.4s, v0.4s
177-
; CHECK-NEXT: sub v1.4s, v19.4s, v1.4s
178-
; CHECK-NEXT: sub v3.4s, v16.4s, v3.4s
179-
; CHECK-NEXT: mov v7.d[1], v1.d[1]
180-
; CHECK-NEXT: mov v6.d[1], v3.d[1]
181-
; CHECK-NEXT: mov v4.d[1], v2.d[1]
182-
; CHECK-NEXT: mov v5.d[1], v0.d[1]
155+
; CHECK-NEXT: rev64 v16.4s, v16.4s
156+
; CHECK-NEXT: rev64 v17.4s, v17.4s
157+
; CHECK-NEXT: mov v1.s[3], v2.s[3]
158+
; CHECK-NEXT: mov v19.s[3], v0.s[3]
159+
; CHECK-NEXT: mov v18.s[3], v7.s[3]
160+
; CHECK-NEXT: ext v16.16b, v16.16b, v0.16b, #4
161+
; CHECK-NEXT: ext v17.16b, v17.16b, v7.16b, #4
162+
; CHECK-NEXT: ext v5.16b, v5.16b, v2.16b, #4
163+
; CHECK-NEXT: trn2 v3.4s, v3.4s, v6.4s
164+
; CHECK-NEXT: sub v20.4s, v19.4s, v16.4s
165+
; CHECK-NEXT: sub v21.4s, v18.4s, v17.4s
166+
; CHECK-NEXT: sub v6.4s, v1.4s, v5.4s
167+
; CHECK-NEXT: mov v18.s[0], v7.s[0]
168+
; CHECK-NEXT: mov v19.s[0], v0.s[0]
169+
; CHECK-NEXT: ext v0.16b, v3.16b, v3.16b, #4
170+
; CHECK-NEXT: mov v1.s[0], v2.s[0]
171+
; CHECK-NEXT: add v2.4s, v18.4s, v17.4s
172+
; CHECK-NEXT: add v3.4s, v19.4s, v16.4s
173+
; CHECK-NEXT: add v7.4s, v4.4s, v0.4s
174+
; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s
175+
; CHECK-NEXT: add v1.4s, v1.4s, v5.4s
176+
; CHECK-NEXT: mov v7.d[1], v0.d[1]
177+
; CHECK-NEXT: mov v1.d[1], v6.d[1]
178+
; CHECK-NEXT: mov v2.d[1], v21.d[1]
179+
; CHECK-NEXT: mov v3.d[1], v20.d[1]
183180
; CHECK-NEXT: movi v0.8h, #1
184181
; CHECK-NEXT: movi v17.2d, #0x00ffff0000ffff
185-
; CHECK-NEXT: ushr v1.4s, v4.4s, #15
186-
; CHECK-NEXT: ushr v2.4s, v7.4s, #15
187-
; CHECK-NEXT: ushr v3.4s, v5.4s, #15
188-
; CHECK-NEXT: ushr v16.4s, v6.4s, #15
189-
; CHECK-NEXT: and v2.16b, v2.16b, v0.16b
182+
; CHECK-NEXT: ushr v4.4s, v2.4s, #15
183+
; CHECK-NEXT: ushr v5.4s, v7.4s, #15
184+
; CHECK-NEXT: ushr v6.4s, v3.4s, #15
185+
; CHECK-NEXT: ushr v16.4s, v1.4s, #15
186+
; CHECK-NEXT: and v5.16b, v5.16b, v0.16b
190187
; CHECK-NEXT: and v16.16b, v16.16b, v0.16b
191-
; CHECK-NEXT: and v3.16b, v3.16b, v0.16b
192-
; CHECK-NEXT: and v0.16b, v1.16b, v0.16b
193-
; CHECK-NEXT: mul v1.4s, v2.4s, v17.4s
194-
; CHECK-NEXT: mul v2.4s, v16.4s, v17.4s
188+
; CHECK-NEXT: and v6.16b, v6.16b, v0.16b
189+
; CHECK-NEXT: and v0.16b, v4.16b, v0.16b
190+
; CHECK-NEXT: mul v4.4s, v5.4s, v17.4s
191+
; CHECK-NEXT: mul v5.4s, v16.4s, v17.4s
195192
; CHECK-NEXT: mul v0.4s, v0.4s, v17.4s
196-
; CHECK-NEXT: mul v3.4s, v3.4s, v17.4s
197-
; CHECK-NEXT: add v7.4s, v1.4s, v7.4s
198-
; CHECK-NEXT: add v6.4s, v2.4s, v6.4s
199-
; CHECK-NEXT: add v4.4s, v0.4s, v4.4s
200-
; CHECK-NEXT: add v5.4s, v3.4s, v5.4s
201-
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
202-
; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b
203-
; CHECK-NEXT: eor v2.16b, v6.16b, v2.16b
204-
; CHECK-NEXT: eor v1.16b, v7.16b, v1.16b
205-
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
206-
; CHECK-NEXT: add v0.4s, v3.4s, v0.4s
193+
; CHECK-NEXT: mul v6.4s, v6.4s, v17.4s
194+
; CHECK-NEXT: add v7.4s, v4.4s, v7.4s
195+
; CHECK-NEXT: add v1.4s, v5.4s, v1.4s
196+
; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
197+
; CHECK-NEXT: add v3.4s, v6.4s, v3.4s
198+
; CHECK-NEXT: eor v0.16b, v2.16b, v0.16b
199+
; CHECK-NEXT: eor v2.16b, v3.16b, v6.16b
200+
; CHECK-NEXT: eor v1.16b, v1.16b, v5.16b
201+
; CHECK-NEXT: eor v3.16b, v7.16b, v4.16b
202+
; CHECK-NEXT: add v1.4s, v3.4s, v1.4s
203+
; CHECK-NEXT: add v0.4s, v2.4s, v0.4s
207204
; CHECK-NEXT: add v0.4s, v1.4s, v0.4s
208205
; CHECK-NEXT: addv s0, v0.4s
209206
; CHECK-NEXT: fmov w8, s0

llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -939,9 +939,11 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) {
939939
define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
940940
; CHECK-LABEL: vselect_equivalent_shuffle_v4i16:
941941
; CHECK: // %bb.0:
942-
; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #2
943-
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
944-
; CHECK-NEXT: ext v0.8b, v0.8b, v0.8b, #2
942+
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
943+
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
944+
; CHECK-NEXT: mov v0.h[2], v1.h[1]
945+
; CHECK-NEXT: mov v0.h[1], v1.h[0]
946+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
945947
; CHECK-NEXT: ret
946948
%c = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
947949
ret <4 x i16> %c
@@ -950,9 +952,8 @@ define <4 x i16> @vselect_equivalent_shuffle_v4i16(<4 x i16> %a, <4 x i16> %b) {
950952
define <4 x i32> @vselect_equivalent_shuffle_v4i32(<4 x i32> %a, <4 x i32> %b) {
951953
; CHECK-LABEL: vselect_equivalent_shuffle_v4i32:
952954
; CHECK: // %bb.0:
953-
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4
954-
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
955-
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #4
955+
; CHECK-NEXT: mov v0.s[1], v1.s[0]
956+
; CHECK-NEXT: mov v0.s[2], v1.s[1]
956957
; CHECK-NEXT: ret
957958
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 3>
958959
ret <4 x i32> %c

llvm/test/CodeGen/AArch64/neon-wide-splat.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,8 @@ define <8 x i8> @shuffle_not1(<16 x i8> %v) {
109109
define <4 x i32> @shuffle_not2(<4 x i32> %v) {
110110
; CHECK-LABEL: shuffle_not2:
111111
; CHECK: // %bb.0: // %entry
112-
; CHECK-NEXT: trn1 v1.4s, v0.4s, v0.4s
113-
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
114-
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
112+
; CHECK-NEXT: mov v0.s[3], v0.s[2]
113+
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v0.4s
115114
; CHECK-NEXT: ret
116115
entry:
117116
%res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 2>

llvm/test/CodeGen/AArch64/shuffle-tbl34.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -517,9 +517,8 @@ define <8 x i16> @shuffle3_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) {
517517
define <4 x i32> @shuffle3_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
518518
; CHECK-LABEL: shuffle3_v4i32:
519519
; CHECK: // %bb.0:
520-
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
521-
; CHECK-NEXT: uzp1 v1.4s, v0.4s, v1.4s
522-
; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8
520+
; CHECK-NEXT: zip1 v0.4s, v0.4s, v0.4s
521+
; CHECK-NEXT: mov v0.s[1], v1.s[0]
523522
; CHECK-NEXT: dup v1.4s, v2.s[0]
524523
; CHECK-NEXT: mov v0.s[2], v1.s[2]
525524
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)