-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[AArch64][GlobalISel] Combine G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16 #142731
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
We will generate G_UNMERGE(G_DUPLANE16) due to the legalization of shuffle vector splats with mismatching vector sizes. The G_DUPLANE instructions can handle different vector sizes (128-bit and 64-bit output, for example), so we can combine away the unmerge.
@llvm/pr-subscribers-backend-aarch64 Author: David Green (davemgreen) ChangesWe will generate G_UNMERGE(G_DUPLANE16) due to the legalization of shuffle vector splats with mismatching vector sizes. The G_DUPLANE instructions can handle different vector sizes (128-bit and 64-bit output, for example), so we can combine away the unmerge. Patch is 60.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142731.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index f84e83816bf33..9fe331d5370de 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -172,6 +172,20 @@ def form_duplane : GICombineRule <
(apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
>;
+// Clean up G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16
+class unmerge_duplane<Instruction Op> : GICombineRule <
+ (defs root:$root),
+ (match (Op $a, $src, $c),
+ (G_UNMERGE_VALUES $d1, $d2, $a):$root,
+ [{ return MRI.getType(${d1}.getReg()).getSizeInBits() == 64; }]),
+ (apply (GIReplaceReg $d2, $d1), (Op $d1, $src, $c))
+>;
+def unmerge_duplane8 : unmerge_duplane<G_DUPLANE8>;
+def unmerge_duplane16 : unmerge_duplane<G_DUPLANE16>;
+def unmerge_duplane32 : unmerge_duplane<G_DUPLANE32>;
+def unmerge_duplane64 : unmerge_duplane<G_DUPLANE64>;
+def unmerge_duplanes : GICombineGroup<[unmerge_duplane8, unmerge_duplane16, unmerge_duplane32, unmerge_duplane64]>;
+
def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, fullrev,
form_duplane, shuf_to_ins]>;
@@ -325,7 +339,8 @@ def AArch64PostLegalizerLowering
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mulv2s64,
- vector_unmerge_lowering, insertelt_nonconst]> {
+ vector_unmerge_lowering, insertelt_nonconst,
+ unmerge_duplanes]> {
}
// Post-legalization combines which are primarily optimizations.
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 4c28ea7592202..12bf09e02aaf9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -422,16 +422,10 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
; the formation of an indexed-by-7 MLS.
define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
-; CHECK-SD-LABEL: test_high_splat:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls.4h v0, v1, v2[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_high_splat:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup.8h v2, v2[7]
-; CHECK-GI-NEXT: mls.4h v0, v2, v1
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_high_splat:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls.4h v0, v1, v2[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index c3ad3b4192cf9..85d8b7c3e2866 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -159,16 +159,10 @@ entry:
}
define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mla v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: mla v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmla_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -189,16 +183,10 @@ entry:
}
define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: mla v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmla_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -271,16 +259,10 @@ entry:
}
define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: mls v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmls_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b
@@ -301,16 +283,10 @@ entry:
}
define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmls_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %b
@@ -427,16 +403,10 @@ entry:
}
define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -455,16 +425,10 @@ entry:
}
define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -483,16 +447,10 @@ entry:
}
define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %a
@@ -511,16 +469,10 @@ entry:
}
define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmul_laneq_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%mul = mul <2 x i32> %shuffle, %a
@@ -567,16 +519,10 @@ entry:
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vfma_laneq_f32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: fmla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vfma_laneq_f32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: fmla v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vfma_laneq_f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
%0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -834,16 +780,10 @@ entry:
}
define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -852,16 +792,10 @@ entry:
}
define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -920,8 +854,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlal_high_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -940,8 +873,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlal_high_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -978,16 +910,10 @@ entry:
}
define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_s16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -996,16 +922,10 @@ entry:
}
define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_s32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1064,8 +984,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1084,8 +1003,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1122,16 +1040,10 @@ entry:
}
define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1140,16 +1052,10 @@ entry:
}
define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlal_laneq_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1208,8 +1114,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlal_high_laneq_u16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1228,8 +1133,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlal_high_laneq_u32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1266,16 +1170,10 @@ entry:
}
define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_u16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1284,16 +1182,10 @@ entry:
}
define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u32:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: umlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u32:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_vmlsl_laneq_u32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
%vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1352,8 +1244,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[7]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1372,8 +1263,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov d1, v1.d[1]
-; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[3]
; CHECK-GI-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1512,16 +1402,10 @@ entry:
}
define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s16:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.h[7]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s16:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: du...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing test coverage for the 64-bit case, but otherwise LGTM — cheers.
We will generate G_UNMERGE(G_DUPLANE16) due to the legalization of shuffle vector splats with mismatching vector sizes. The G_DUPLANE instructions can handle different vector sizes (128-bit and 64-bit output, for example), so we can combine away the unmerge.