[AArch64][GlobalISel] Ensure we have an insert-subreg v4i32 GPR pattern #142724

Open
wants to merge 1 commit into base: main

Conversation

davemgreen (Collaborator)

This is the GISel equivalent of scalar_to_vector: it makes sure that when we insert into undef we use an fmov, which avoids the artificial dependency on the previous contents of the register. It also adds v2i32 and v2i64 patterns for similar reasons.
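
As a concrete illustration (this mirrors the scalar_to_vector.v4i32 test in arm64-neon-copy.ll updated below): inserting a GPR into lane 0 of an undef vector previously selected a lane move under GlobalISel, which only partially writes the destination vector register, whereas the new pattern selects an fmov into the s sub-register. A minimal sketch, with the before/after assembly summarised from the CHECK-GI lines in the patch:

define <4 x i32> @scalar_to_vector_v4i32(i32 %a) {
  ; GlobalISel before this patch:  mov v0.s[0], w0
  ; GlobalISel with this patch:    fmov s0, w0
  %b = insertelement <4 x i32> undef, i32 %a, i32 0
  ret <4 x i32> %b
}

Because the lane move only updates one element, the result keeps a false dependency on whatever last defined v0; the fmov defines the whole register, so that dependency disappears.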

@llvmbot (Member)

llvmbot commented Jun 4, 2025

@llvm/pr-subscribers-llvm-globalisel

@llvm/pr-subscribers-backend-aarch64

Author: David Green (davemgreen)

Changes

This is the GISel equivalent of scalar_to_vector: it makes sure that when we insert into undef we use an fmov, which avoids the artificial dependency on the previous contents of the register. It also adds v2i32 and v2i64 patterns for similar reasons.


Patch is 408.72 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142724.diff

46 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64InstrInfo.td (+7)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir (+4-7)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll (+1-2)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll (+1-2)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/aarch64-smull.ll (+56-56)
  • (modified) llvm/test/CodeGen/AArch64/abs.ll (+1-2)
  • (modified) llvm/test/CodeGen/AArch64/arm64-dup.ll (+18-39)
  • (modified) llvm/test/CodeGen/AArch64/arm64-fp128.ll (+7-7)
  • (modified) llvm/test/CodeGen/AArch64/arm64-neon-copy.ll (+12-28)
  • (modified) llvm/test/CodeGen/AArch64/arm64-neon-mul-div-cte.ll (+6-6)
  • (modified) llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll (+18-38)
  • (modified) llvm/test/CodeGen/AArch64/bitcast-extend.ll (+18-18)
  • (modified) llvm/test/CodeGen/AArch64/bitcast.ll (+18-29)
  • (modified) llvm/test/CodeGen/AArch64/bswap.ll (+1-2)
  • (modified) llvm/test/CodeGen/AArch64/concat-vector.ll (+9-19)
  • (modified) llvm/test/CodeGen/AArch64/ctlz.ll (+11-11)
  • (modified) llvm/test/CodeGen/AArch64/cttz.ll (+9-9)
  • (modified) llvm/test/CodeGen/AArch64/fcmp.ll (+104-104)
  • (modified) llvm/test/CodeGen/AArch64/fcopysign.ll (+2-2)
  • (modified) llvm/test/CodeGen/AArch64/fptoi.ll (+10-10)
  • (modified) llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll (+13-24)
  • (modified) llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll (+13-24)
  • (modified) llvm/test/CodeGen/AArch64/freeze.ll (+2-3)
  • (modified) llvm/test/CodeGen/AArch64/fsh.ll (+337-337)
  • (modified) llvm/test/CodeGen/AArch64/icmp.ll (+8-8)
  • (modified) llvm/test/CodeGen/AArch64/insertextract.ll (+7-15)
  • (modified) llvm/test/CodeGen/AArch64/itofp.ll (+4-4)
  • (modified) llvm/test/CodeGen/AArch64/mul.ll (+17-17)
  • (modified) llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll (+2-2)
  • (modified) llvm/test/CodeGen/AArch64/neon-compare-instructions.ll (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/neon-dotreduce.ll (+2496-2272)
  • (modified) llvm/test/CodeGen/AArch64/phi.ll (+15-14)
  • (modified) llvm/test/CodeGen/AArch64/popcount.ll (+8-14)
  • (modified) llvm/test/CodeGen/AArch64/ptradd.ll (+16-25)
  • (modified) llvm/test/CodeGen/AArch64/rem.ll (+314-306)
  • (modified) llvm/test/CodeGen/AArch64/select_cc.ll (+1-1)
  • (modified) llvm/test/CodeGen/AArch64/sext.ll (+42-42)
  • (modified) llvm/test/CodeGen/AArch64/shift.ll (+3-6)
  • (modified) llvm/test/CodeGen/AArch64/shufflevector.ll (+8-14)
  • (modified) llvm/test/CodeGen/AArch64/trunc.ll (+8-14)
  • (modified) llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll (+26-26)
  • (modified) llvm/test/CodeGen/AArch64/vecreduce-add.ll (+148-114)
  • (modified) llvm/test/CodeGen/AArch64/xtn.ll (+26-53)
  • (modified) llvm/test/CodeGen/AArch64/zext.ll (+43-43)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 727831896737d..3cea0b58c4439 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7269,6 +7269,13 @@ def : Pat<(v4i16 (vec_ins_or_scal_vec GPR32:$Rn)),
           (SUBREG_TO_REG (i32 0),
                          (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
 
+def : Pat<(v2i32 (vec_ins_or_scal_vec GPR32:$Rn)),
+          (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), GPR32:$Rn, ssub)>;
+def : Pat<(v4i32 (vec_ins_or_scal_vec GPR32:$Rn)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GPR32:$Rn, ssub)>;
+def : Pat<(v2i64 (vec_ins_or_scal_vec GPR64:$Rn)),
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GPR64:$Rn, dsub)>;
+
 def : Pat<(v4f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
           (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
 def : Pat<(v8f16 (vec_ins_or_scal_vec (f16 FPR16:$Rn))),
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
index be80886ed3efe..ba53cb57c2ef2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector-widen-crash.ll
@@ -11,7 +11,7 @@ define i32 @bar() {
 ; CHECK-NEXT:    movi.2d v0, #0000000000000000
 ; CHECK-NEXT:    umov.b w8, v0[0]
 ; CHECK-NEXT:    umov.b w9, v0[1]
-; CHECK-NEXT:    mov.s v1[0], w8
+; CHECK-NEXT:    fmov s1, w8
 ; CHECK-NEXT:    umov.b w8, v0[2]
 ; CHECK-NEXT:    mov.s v1[1], w9
 ; CHECK-NEXT:    umov.b w9, v0[3]
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
index 0115531dfb09a..22d1ccc056eb4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-build-vector-to-dup.mir
@@ -57,15 +57,12 @@ body:             |
     ; SELECT-NEXT: %r:gpr32 = COPY $w0
     ; SELECT-NEXT: %q:gpr32 = COPY $w1
     ; SELECT-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF
+    ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], %r, %subreg.ssub
     ; SELECT-NEXT: [[DEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; SELECT-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[DEF]], %subreg.dsub
-    ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG]], 0, %r
+    ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF1]], [[INSERT_SUBREG]], %subreg.dsub
+    ; SELECT-NEXT: [[INSvi32gpr:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q
     ; SELECT-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr]].dsub
-    ; SELECT-NEXT: [[DEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; SELECT-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF2]], [[COPY]], %subreg.dsub
-    ; SELECT-NEXT: [[INSvi32gpr1:%[0-9]+]]:fpr128 = INSvi32gpr [[INSERT_SUBREG1]], 1, %q
-    ; SELECT-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY [[INSvi32gpr1]].dsub
-    ; SELECT-NEXT: $d0 = COPY [[COPY1]]
+    ; SELECT-NEXT: $d0 = COPY [[COPY]]
     ; SELECT-NEXT: RET_ReallyLR implicit $d0
     %r:_(s32) = COPY $w0
     %q:_(s32) = COPY $w1
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 9734ab35bd6b2..7f922c0047553 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -76,8 +76,7 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    bic w9, w9, w8
 ; CHECK-GI-NEXT:    and w8, w8, w10
 ; CHECK-GI-NEXT:    orr w8, w9, w8
-; CHECK-GI-NEXT:    mov v0.s[0], w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    ret
   %neg = xor <1 x i32> %C, <i32 -1>
   %and = and <1 x i32> %neg, %B
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 45ad4b07ff66f..b8eb8269d605c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -76,8 +76,7 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ; CHECK-GI-NEXT:    and w9, w8, w9
 ; CHECK-GI-NEXT:    bic w8, w10, w8
 ; CHECK-GI-NEXT:    orr w8, w9, w8
-; CHECK-GI-NEXT:    mov v0.s[0], w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    ret
   %and = and <1 x i32> %C, %B
   %neg = xor <1 x i32> %C, <i32 -1>
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 418113a4e4e09..8655bb1292ef7 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -204,7 +204,7 @@ define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture r
 ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-GI-NEXT:    ldrh w9, [x2], #16
 ; CHECK-GI-NEXT:    subs x8, x8, #8
-; CHECK-GI-NEXT:    mov v2.s[0], w9
+; CHECK-GI-NEXT:    fmov s2, w9
 ; CHECK-GI-NEXT:    mov w9, w0
 ; CHECK-GI-NEXT:    add w0, w0, #8
 ; CHECK-GI-NEXT:    lsl x9, x9, #2
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index d6fd4c4110a12..2f23a32c36a9f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -2282,14 +2282,14 @@ define <2 x i64> @asr(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #32
 ; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #32
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
     %x = ashr <2 x i64> %a, <i64 32, i64 32>
     %y = ashr <2 x i64> %b, <i64 32, i64 32>
@@ -2317,14 +2317,14 @@ define <2 x i64> @asr_const(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI81_0
 ; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #32
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI81_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
     %x = ashr <2 x i64> %a, <i64 32, i64 32>
     %z = mul nsw <2 x i64> %x, <i64 31, i64 31>
@@ -2799,14 +2799,14 @@ define <2 x i64> @sdistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    sshll v2.2d, v2.2s, #0
 ; CHECK-GI-NEXT:    saddl v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d2
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %4 = sext <2 x i32> %src1 to <2 x i64>
@@ -2838,14 +2838,14 @@ define <2 x i64> @sdistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) {
 ; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI101_0]
 ; CHECK-GI-NEXT:    saddw v0.2d, v2.2d, v0.2s
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %4 = sext <2 x i32> %src1 to <2 x i64>
@@ -2875,14 +2875,14 @@ define <2 x i64> @sdistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI102_0
 ; CHECK-GI-NEXT:    saddl v0.2d, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI102_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %4 = sext <2 x i32> %src1 to <2 x i64>
@@ -2909,14 +2909,14 @@ define <2 x i64> @udistribute_v2i32(<2 x i32> %src1, <2 x i32> %src2, <2 x i32>
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ushll v2.2d, v2.2s, #0
 ; CHECK-GI-NEXT:    uaddl v0.2d, v0.2s, v1.2s
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d2
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v2.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    fmov x11, d2
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov x9, v2.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %4 = zext <2 x i32> %src1 to <2 x i64>
@@ -2948,14 +2948,14 @@ define <2 x i64> @udistribute_const1_v2i32(<2 x i32> %src1, <2 x i32> %mul) {
 ; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI104_0]
 ; CHECK-GI-NEXT:    uaddw v0.2d, v2.2d, v0.2s
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %4 = zext <2 x i32> %src1 to <2 x i64>
@@ -2985,14 +2985,14 @@ define <2 x i64> @udistribute_const2_v2i32(<2 x i32> %src1, <2 x i32> %src2) {
 ; CHECK-GI-NEXT:    adrp x8, .LCPI105_0
 ; CHECK-GI-NEXT:    uaddl v0.2d, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI105_0]
-; CHECK-GI-NEXT:    fmov x8, d0
-; CHECK-GI-NEXT:    fmov x9, d1
-; CHECK-GI-NEXT:    mov x10, v0.d[1]
-; CHECK-GI-NEXT:    mov x11, v1.d[1]
+; CHECK-GI-NEXT:    fmov x10, d0
+; CHECK-GI-NEXT:    fmov x11, d1
+; CHECK-GI-NEXT:    mov x8, v0.d[1]
+; CHECK-GI-NEXT:    mov x9, v1.d[1]
+; CHECK-GI-NEXT:    mul x10, x10, x11
 ; CHECK-GI-NEXT:    mul x8, x8, x9
-; CHECK-GI-NEXT:    mul x9, x10, x11
-; CHECK-GI-NEXT:    mov v0.d[0], x8
-; CHECK-GI-NEXT:    mov v0.d[1], x9
+; CHECK-GI-NEXT:    fmov d0, x10
+; CHECK-GI-NEXT:    mov v0.d[1], x8
 ; CHECK-GI-NEXT:    ret
 entry:
   %4 = zext <2 x i32> %src1 to <2 x i64>
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index d501d9ed24547..0f56d25a47b2a 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -247,8 +247,7 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
 ; CHECK-GI-NEXT:    fmov w9, s0
 ; CHECK-GI-NEXT:    cmp w8, #0
 ; CHECK-GI-NEXT:    cneg w8, w9, le
-; CHECK-GI-NEXT:    mov v0.s[0], w8
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    ret
 entry:
   %res = call <1 x i32> @llvm.abs.v1i32(<1 x i32> %a, i1 0)
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 4c28ea7592202..c279cf0f241d2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -334,40 +334,25 @@ entry:
 }
 
 define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone  {
-; CHECK-SD-LABEL: f:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    mov.s v0[1], w1
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: f:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov.s v0[0], w0
-; CHECK-GI-NEXT:    mov.s v0[1], w1
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: f:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov.s v0[1], w1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
   %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0
   %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1
   ret <2 x i32> %vecinit1
 }
 
 define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone  {
-; CHECK-SD-LABEL: g:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    mov.s v0[1], w1
-; CHECK-SD-NEXT:    mov.s v0[2], w1
-; CHECK-SD-NEXT:    mov.s v0[3], w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: g:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov.s v0[0], w0
-; CHECK-GI-NEXT:    mov.s v0[1], w1
-; CHECK-GI-NEXT:    mov.s v0[2], w1
-; CHECK-GI-NEXT:    mov.s v0[3], w0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: g:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    mov.s v0[1], w1
+; CHECK-NEXT:    mov.s v0[2], w1
+; CHECK-NEXT:    mov.s v0[3], w0
+; CHECK-NEXT:    ret
   %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0
   %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1
   %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2
@@ -376,17 +361,11 @@ define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone  {
 }
 
 define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone  {
-; CHECK-SD-LABEL: h:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov d0, x0
-; CHECK-SD-NEXT:    mov.d v0[1], x1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: h:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov.d v0[0], x0
-; CHECK-GI-NEXT:    mov.d v0[1], x1
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    mov.d v0[1], x1
+; CHECK-NEXT:    ret
   %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
   %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1
   ret <2 x i64> %vecinit1
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
index 7eb26096ed156..a75f6419d5a5a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -618,7 +618,7 @@ define <2 x i32> @vec_fptosi_32(<2 x fp128> %val) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixtfsi
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    fmov s0, w19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -661,7 +661,7 @@ define <2 x i64> @vec_fptosi_64(<2 x fp128> %val) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov x19, x0
 ; CHECK-GI-NEXT:    bl __fixtfdi
-; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    fmov d0, x19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.d[1], x0
 ; CHECK-GI-NEXT:    add sp, sp, #32
@@ -702,7 +702,7 @@ define <2 x i32> @vec_fptoui_32(<2 x fp128> %val) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __fixunstfsi
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    fmov s0, w19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.s[1], w0
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
@@ -745,7 +745,7 @@ define <2 x i64> @vec_fptoui_64(<2 x fp128> %val) {
 ; CHECK-GI-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov x19, x0
 ; CHECK-GI-NEXT:    bl __fixunstfdi
-; CHECK-GI-NEXT:    mov v0.d[0], x19
+; CHECK-GI-NEXT:    fmov d0, x19
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-GI-NEXT:    mov v0.d[1], x0
 ; CHECK-GI-NEXT:    add sp, sp, #32
@@ -977,7 +977,7 @@ define <2 x i1> @vec_setcc1(<2 x fp128> %lhs, <2 x fp128> %rhs) {
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w19, le
 ; CHECK-GI-NEXT:    bl __letf2
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    fmov s0, w19
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w8, le
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
@@ -1032,7 +1032,7 @@ define <2 x i1> @vec_setcc2(<2 x fp128> %lhs, <2 x fp128> %rhs) {
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w19, gt
 ; CHECK-GI-NEXT:    bl __letf2
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    fmov s0, w19
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w8, gt
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #32] // 16-byte Folded Reload
@@ -1109,7 +1109,7 @@ define <2 x i1> @vec_setcc3(<2 x fp128> %lhs, <2 x fp128> %rhs) {
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    cset w20, eq
 ; CHECK-GI-NEXT:    bl __unordtf2
-; CHECK-GI-NEXT:    mov v0.s[0], w19
+; CHECK-GI-NEXT:    fmov s0, w19
 ; CHECK-GI-NEXT:    cmp w0, #0
 ; CHECK-GI-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-GI-NEXT:    cset w8, ne
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 51f1351a5edf4..ddd8a72618b1e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1198,44 +1198,28 @@ define <8 x i16> @scalar_to_vector.v8i16(i16 %a) {
 }
 
 define <2 x i32> @scalar_to_vector.v2i32(i32 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v2i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v2i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v0.s[0], w0
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scalar_to_vector.v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ret
   %b = insertelement <2 x i32> undef, i32 %a, i32 0
   ret <2 x i32> %b
 }
 
 define <4 x i32> @scalar_to_vector.v4i32(i32 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v4i32:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    fmov s0, w0
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: scalar_to_vector.v4i32:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov v0.s[0], w0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: scalar_to_vector.v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov s0, w0
+; CHECK-NEXT:    ret
   %b = insertelement <4 x i32> undef, i32 %a, i32 0
   ret <4 x i32> %b
 }
 
 define <2 x i64> @scalar_to_vector.v2i64(i64 %a) {
-; CHECK-SD-LABEL: scalar_to_vector.v2i64:
-; CHECK-SD:      ...
[truncated]

davemgreen force-pushed the gh-gi-insertsubregpat branch from 631ccab to c73b0ff on June 4, 2025 at 17:41.
c-rhodes (Collaborator) left a comment

LGTM cheers

3 participants