Skip to content

[AArch64][GlobalISel] Combine G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16 #142731

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

davemgreen
Copy link
Collaborator

We will generate G_UNMERGE(G_DUPLANE16) due to the legalization of shuffle vector splats with mismatching vector sizes. The G_DUPLANE intrinsics can handle different vector sizes (128bit and 64bit output, for example), and we can combine away the unmerge.

We will generate G_UNMERGE(G_DUPLANE16) due to the legalization of shuffle
vector splats with mismatching vector sizes. The G_DUPLANE intrinsics can
handle different vector sizes (128bit and 64bit output, for example), and we
can combine away the unmerge.
@llvmbot
Copy link
Member

llvmbot commented Jun 4, 2025

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

We will generate G_UNMERGE(G_DUPLANE16) due to the legalization of shuffle vector splats with mismatching vector sizes. The G_DUPLANE intrinsics can handle different vector sizes (128bit and 64bit output, for example), and we can combine away the unmerge.


Patch is 60.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142731.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+16-1)
  • (modified) llvm/test/CodeGen/AArch64/arm64-dup.ll (+4-10)
  • (modified) llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll (+228-556)
  • (modified) llvm/test/CodeGen/AArch64/arm64-neon-copy.ll (+12-30)
  • (modified) llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll (+16-40)
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index f84e83816bf33..9fe331d5370de 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -172,6 +172,20 @@ def form_duplane : GICombineRule <
   (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
+// Clean up G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16
+class unmerge_duplane<Instruction Op> : GICombineRule <
+  (defs root:$root),
+  (match (Op $a, $src, $c),
+         (G_UNMERGE_VALUES $d1, $d2, $a):$root,
+         [{ return MRI.getType(${d1}.getReg()).getSizeInBits() == 64; }]),
+  (apply (GIReplaceReg $d2, $d1), (Op $d1, $src, $c))
+>;
+def unmerge_duplane8 : unmerge_duplane<G_DUPLANE8>;
+def unmerge_duplane16 : unmerge_duplane<G_DUPLANE16>;
+def unmerge_duplane32 : unmerge_duplane<G_DUPLANE32>;
+def unmerge_duplane64 : unmerge_duplane<G_DUPLANE64>;
+def unmerge_duplanes : GICombineGroup<[unmerge_duplane8, unmerge_duplane16, unmerge_duplane32, unmerge_duplane64]>;
+
 def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn, fullrev,
                                               form_duplane, shuf_to_ins]>;
 
@@ -325,7 +339,8 @@ def AArch64PostLegalizerLowering
                         lower_vector_fcmp, form_truncstore,
                         vector_sext_inreg_to_shift,
                         unmerge_ext_to_unmerge, lower_mulv2s64,
-                        vector_unmerge_lowering, insertelt_nonconst]> {
+                        vector_unmerge_lowering, insertelt_nonconst,
+                        unmerge_duplanes]> {
 }
 
 // Post-legalization combines which are primarily optimizations.
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 4c28ea7592202..12bf09e02aaf9 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -422,16 +422,10 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
 ; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
 ; the formation of an indexed-by-7 MLS.
 define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
-; CHECK-SD-LABEL: test_high_splat:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls.4h v0, v1, v2[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_high_splat:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup.8h v2, v2[7]
-; CHECK-GI-NEXT:    mls.4h v0, v2, v1
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_high_splat:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls.4h v0, v1, v2[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index c3ad3b4192cf9..85d8b7c3e2866 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -159,16 +159,10 @@ entry:
 }
 
 define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    mla v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -189,16 +183,10 @@ entry:
 }
 
 define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmla_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmla_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    mla v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmla_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -271,16 +259,10 @@ entry:
 }
 
 define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.4h, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    mls v0.4h, v2.4h, v1.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.4h, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %b
@@ -301,16 +283,10 @@ entry:
 }
 
 define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmls_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mls v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmls_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    mls v0.2s, v2.2s, v1.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmls_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mls v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %b
@@ -427,16 +403,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -455,16 +425,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -483,16 +447,10 @@ entry:
 }
 
 define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.4h, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.8h, v1.h[7]
-; CHECK-GI-NEXT:    mul v0.4h, v1.4h, v0.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.4h, v0.4h, v1.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %mul = mul <4 x i16> %shuffle, %a
@@ -511,16 +469,10 @@ entry:
 }
 
 define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmul_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    mul v0.2s, v0.2s, v1.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmul_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v1.4s, v1.s[3]
-; CHECK-GI-NEXT:    mul v0.2s, v1.2s, v0.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmul_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mul v0.2s, v0.2s, v1.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %mul = mul <2 x i32> %shuffle, %a
@@ -567,16 +519,10 @@ entry:
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
 define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
-; CHECK-SD-LABEL: test_vfma_laneq_f32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vfma_laneq_f32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    fmla v0.2s, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vfma_laneq_f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmla v0.2s, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
   %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
@@ -834,16 +780,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -852,16 +792,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -920,8 +854,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlal v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -940,8 +873,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -978,16 +910,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -996,16 +922,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_s32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_s32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_s32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1064,8 +984,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    smlsl v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1084,8 +1003,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    smlsl v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1122,16 +1040,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1140,16 +1052,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlal_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlal_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlal_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1208,8 +1114,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlal v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1228,8 +1133,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlal v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1266,16 +1170,10 @@ entry:
 }
 
 define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
   %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
@@ -1284,16 +1182,10 @@ entry:
 }
 
 define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) {
-; CHECK-SD-LABEL: test_vmlsl_laneq_u32:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmlsl_laneq_u32:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: test_vmlsl_laneq_u32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
+; CHECK-NEXT:    ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
   %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
@@ -1352,8 +1244,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.8h, v2.h[7]
-; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.4h
+; CHECK-GI-NEXT:    umlsl v0.4s, v1.4h, v2.h[7]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1372,8 +1263,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32
 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    mov d1, v1.d[1]
-; CHECK-GI-NEXT:    dup v2.4s, v2.s[3]
-; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.2s
+; CHECK-GI-NEXT:    umlsl v0.2d, v1.2s, v2.s[3]
 ; CHECK-GI-NEXT:    ret
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
@@ -1512,16 +1402,10 @@ entry:
 }
 
 define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) {
-; CHECK-SD-LABEL: test_vmull_laneq_s16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    smull v0.4s, v0.4h, v1.h[7]
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: test_vmull_laneq_s16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    du...
[truncated]

Copy link
Collaborator

@c-rhodes c-rhodes left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing test coverage for 64-bit, but otherwise LGTM cheers

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants